skypilot-nightly 1.0.0.dev20250216__py3-none-any.whl → 1.0.0.dev20250218__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. sky/__init__.py +48 -22
  2. sky/adaptors/aws.py +2 -1
  3. sky/adaptors/azure.py +4 -4
  4. sky/adaptors/cloudflare.py +4 -4
  5. sky/adaptors/kubernetes.py +8 -8
  6. sky/authentication.py +42 -45
  7. sky/backends/backend.py +2 -2
  8. sky/backends/backend_utils.py +108 -221
  9. sky/backends/cloud_vm_ray_backend.py +283 -282
  10. sky/benchmark/benchmark_utils.py +6 -2
  11. sky/check.py +40 -28
  12. sky/cli.py +1213 -1116
  13. sky/client/__init__.py +1 -0
  14. sky/client/cli.py +5644 -0
  15. sky/client/common.py +345 -0
  16. sky/client/sdk.py +1757 -0
  17. sky/cloud_stores.py +12 -6
  18. sky/clouds/__init__.py +0 -2
  19. sky/clouds/aws.py +20 -13
  20. sky/clouds/azure.py +5 -3
  21. sky/clouds/cloud.py +1 -1
  22. sky/clouds/cudo.py +2 -1
  23. sky/clouds/do.py +7 -3
  24. sky/clouds/fluidstack.py +3 -2
  25. sky/clouds/gcp.py +10 -8
  26. sky/clouds/ibm.py +8 -7
  27. sky/clouds/kubernetes.py +7 -6
  28. sky/clouds/lambda_cloud.py +8 -7
  29. sky/clouds/oci.py +4 -3
  30. sky/clouds/paperspace.py +2 -1
  31. sky/clouds/runpod.py +2 -1
  32. sky/clouds/scp.py +8 -7
  33. sky/clouds/service_catalog/__init__.py +3 -3
  34. sky/clouds/service_catalog/aws_catalog.py +7 -1
  35. sky/clouds/service_catalog/common.py +4 -2
  36. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
  37. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +1 -0
  38. sky/clouds/utils/oci_utils.py +1 -1
  39. sky/clouds/vast.py +2 -1
  40. sky/clouds/vsphere.py +2 -1
  41. sky/core.py +263 -99
  42. sky/dag.py +4 -0
  43. sky/data/mounting_utils.py +2 -1
  44. sky/data/storage.py +97 -35
  45. sky/data/storage_utils.py +69 -9
  46. sky/exceptions.py +138 -5
  47. sky/execution.py +47 -50
  48. sky/global_user_state.py +105 -22
  49. sky/jobs/__init__.py +12 -14
  50. sky/jobs/client/__init__.py +0 -0
  51. sky/jobs/client/sdk.py +296 -0
  52. sky/jobs/constants.py +30 -1
  53. sky/jobs/controller.py +12 -6
  54. sky/jobs/dashboard/dashboard.py +2 -6
  55. sky/jobs/recovery_strategy.py +22 -29
  56. sky/jobs/server/__init__.py +1 -0
  57. sky/jobs/{core.py → server/core.py} +101 -34
  58. sky/jobs/server/dashboard_utils.py +64 -0
  59. sky/jobs/server/server.py +182 -0
  60. sky/jobs/utils.py +32 -23
  61. sky/models.py +27 -0
  62. sky/optimizer.py +22 -22
  63. sky/provision/__init__.py +6 -3
  64. sky/provision/aws/config.py +2 -2
  65. sky/provision/aws/instance.py +1 -1
  66. sky/provision/azure/instance.py +1 -1
  67. sky/provision/cudo/instance.py +1 -1
  68. sky/provision/do/instance.py +1 -1
  69. sky/provision/do/utils.py +0 -5
  70. sky/provision/fluidstack/fluidstack_utils.py +4 -3
  71. sky/provision/fluidstack/instance.py +4 -2
  72. sky/provision/gcp/instance.py +1 -1
  73. sky/provision/instance_setup.py +2 -2
  74. sky/provision/kubernetes/constants.py +8 -0
  75. sky/provision/kubernetes/instance.py +1 -1
  76. sky/provision/kubernetes/utils.py +67 -76
  77. sky/provision/lambda_cloud/instance.py +3 -15
  78. sky/provision/logging.py +1 -1
  79. sky/provision/oci/instance.py +7 -4
  80. sky/provision/paperspace/instance.py +1 -1
  81. sky/provision/provisioner.py +3 -2
  82. sky/provision/runpod/instance.py +1 -1
  83. sky/provision/vast/instance.py +1 -1
  84. sky/provision/vsphere/instance.py +2 -11
  85. sky/resources.py +63 -47
  86. sky/serve/__init__.py +6 -10
  87. sky/serve/client/__init__.py +0 -0
  88. sky/serve/client/sdk.py +366 -0
  89. sky/serve/constants.py +3 -0
  90. sky/serve/replica_managers.py +10 -10
  91. sky/serve/serve_utils.py +56 -36
  92. sky/serve/server/__init__.py +0 -0
  93. sky/serve/{core.py → server/core.py} +37 -17
  94. sky/serve/server/server.py +117 -0
  95. sky/serve/service.py +8 -1
  96. sky/server/__init__.py +1 -0
  97. sky/server/common.py +442 -0
  98. sky/server/constants.py +21 -0
  99. sky/server/html/log.html +174 -0
  100. sky/server/requests/__init__.py +0 -0
  101. sky/server/requests/executor.py +462 -0
  102. sky/server/requests/payloads.py +481 -0
  103. sky/server/requests/queues/__init__.py +0 -0
  104. sky/server/requests/queues/mp_queue.py +76 -0
  105. sky/server/requests/requests.py +567 -0
  106. sky/server/requests/serializers/__init__.py +0 -0
  107. sky/server/requests/serializers/decoders.py +192 -0
  108. sky/server/requests/serializers/encoders.py +166 -0
  109. sky/server/server.py +1095 -0
  110. sky/server/stream_utils.py +144 -0
  111. sky/setup_files/MANIFEST.in +1 -0
  112. sky/setup_files/dependencies.py +12 -4
  113. sky/setup_files/setup.py +1 -1
  114. sky/sky_logging.py +9 -13
  115. sky/skylet/autostop_lib.py +2 -2
  116. sky/skylet/constants.py +46 -12
  117. sky/skylet/events.py +5 -6
  118. sky/skylet/job_lib.py +78 -66
  119. sky/skylet/log_lib.py +17 -11
  120. sky/skypilot_config.py +79 -94
  121. sky/task.py +119 -73
  122. sky/templates/aws-ray.yml.j2 +4 -4
  123. sky/templates/azure-ray.yml.j2 +3 -2
  124. sky/templates/cudo-ray.yml.j2 +3 -2
  125. sky/templates/fluidstack-ray.yml.j2 +3 -2
  126. sky/templates/gcp-ray.yml.j2 +3 -2
  127. sky/templates/ibm-ray.yml.j2 +3 -2
  128. sky/templates/jobs-controller.yaml.j2 +1 -12
  129. sky/templates/kubernetes-ray.yml.j2 +3 -2
  130. sky/templates/lambda-ray.yml.j2 +3 -2
  131. sky/templates/oci-ray.yml.j2 +3 -2
  132. sky/templates/paperspace-ray.yml.j2 +3 -2
  133. sky/templates/runpod-ray.yml.j2 +3 -2
  134. sky/templates/scp-ray.yml.j2 +3 -2
  135. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  136. sky/templates/vsphere-ray.yml.j2 +4 -2
  137. sky/templates/websocket_proxy.py +64 -0
  138. sky/usage/constants.py +8 -0
  139. sky/usage/usage_lib.py +45 -11
  140. sky/utils/accelerator_registry.py +33 -53
  141. sky/utils/admin_policy_utils.py +2 -1
  142. sky/utils/annotations.py +51 -0
  143. sky/utils/cli_utils/status_utils.py +33 -3
  144. sky/utils/cluster_utils.py +356 -0
  145. sky/utils/command_runner.py +69 -14
  146. sky/utils/common.py +74 -0
  147. sky/utils/common_utils.py +133 -93
  148. sky/utils/config_utils.py +204 -0
  149. sky/utils/control_master_utils.py +2 -3
  150. sky/utils/controller_utils.py +133 -147
  151. sky/utils/dag_utils.py +72 -24
  152. sky/utils/kubernetes/deploy_remote_cluster.sh +2 -2
  153. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  154. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  155. sky/utils/log_utils.py +83 -23
  156. sky/utils/message_utils.py +81 -0
  157. sky/utils/registry.py +127 -0
  158. sky/utils/resources_utils.py +2 -2
  159. sky/utils/rich_utils.py +213 -34
  160. sky/utils/schemas.py +19 -2
  161. sky/{status_lib.py → utils/status_lib.py} +12 -7
  162. sky/utils/subprocess_utils.py +51 -35
  163. sky/utils/timeline.py +7 -2
  164. sky/utils/ux_utils.py +95 -25
  165. {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/METADATA +8 -3
  166. {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/RECORD +170 -132
  167. sky/clouds/cloud_registry.py +0 -76
  168. sky/utils/cluster_yaml_utils.py +0 -24
  169. {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/LICENSE +0 -0
  170. {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/WHEEL +0 -0
  171. {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/entry_points.txt +0 -0
  172. {skypilot_nightly-1.0.0.dev20250216.dist-info → skypilot_nightly-1.0.0.dev20250218.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,6 @@
1
1
  """Backend: runs on cloud virtual machines, managed by Ray."""
2
2
  import copy
3
3
  import enum
4
- import functools
5
- import getpass
6
4
  import inspect
7
5
  import json
8
6
  import math
@@ -37,7 +35,6 @@ from sky import optimizer
37
35
  from sky import provision as provision_lib
38
36
  from sky import resources as resources_lib
39
37
  from sky import sky_logging
40
- from sky import status_lib
41
38
  from sky import task as task_lib
42
39
  from sky.backends import backend_utils
43
40
  from sky.backends import wheel_utils
@@ -45,24 +42,30 @@ from sky.clouds import service_catalog
45
42
  from sky.clouds.utils import gcp_utils
46
43
  from sky.data import data_utils
47
44
  from sky.data import storage as storage_lib
48
- from sky.jobs import constants as managed_jobs_constants
49
45
  from sky.provision import common as provision_common
50
46
  from sky.provision import instance_setup
51
47
  from sky.provision import metadata_utils
52
48
  from sky.provision import provisioner
53
49
  from sky.provision.kubernetes import utils as kubernetes_utils
50
+ from sky.server.requests import requests as requests_lib
54
51
  from sky.skylet import autostop_lib
55
52
  from sky.skylet import constants
56
53
  from sky.skylet import job_lib
57
54
  from sky.skylet import log_lib
58
55
  from sky.usage import usage_lib
59
56
  from sky.utils import accelerator_registry
57
+ from sky.utils import annotations
58
+ from sky.utils import cluster_utils
60
59
  from sky.utils import command_runner
60
+ from sky.utils import common
61
61
  from sky.utils import common_utils
62
62
  from sky.utils import controller_utils
63
63
  from sky.utils import log_utils
64
+ from sky.utils import message_utils
65
+ from sky.utils import registry
64
66
  from sky.utils import resources_utils
65
67
  from sky.utils import rich_utils
68
+ from sky.utils import status_lib
66
69
  from sky.utils import subprocess_utils
67
70
  from sky.utils import timeline
68
71
  from sky.utils import ux_utils
@@ -152,9 +155,9 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
152
155
  # If the command is too long, we instead write it to a file, rsync and execute
153
156
  # it.
154
157
  #
155
- # We use 120KB as a threshold to be safe for other arguments that
158
+ # We use 100KB as a threshold to be safe for other arguments that
156
159
  # might be added during ssh.
157
- _MAX_INLINE_SCRIPT_LENGTH = 120 * 1024
160
+ _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
158
161
 
159
162
  _RESOURCES_UNAVAILABLE_LOG = (
160
163
  'Reasons for provision failures (for details, please check the log above):')
@@ -194,7 +197,7 @@ def _get_cluster_config_template(cloud):
194
197
 
195
198
 
196
199
  def write_ray_up_script_with_patched_launch_hash_fn(
197
- cluster_config_path: str,
200
+ cluster_config_path: Optional[str],
198
201
  ray_up_kwargs: Dict[str, bool],
199
202
  ) -> str:
200
203
  """Writes a Python script that runs `ray up` with our launch hash func.
@@ -1181,7 +1184,7 @@ class RetryingVmProvisioner(object):
1181
1184
  def __init__(self,
1182
1185
  log_dir: str,
1183
1186
  dag: 'dag.Dag',
1184
- optimize_target: 'optimizer.OptimizeTarget',
1187
+ optimize_target: 'common.OptimizeTarget',
1185
1188
  requested_features: Set[clouds.CloudImplementationFeatures],
1186
1189
  local_wheel_path: pathlib.Path,
1187
1190
  wheel_hash: str,
@@ -1554,6 +1557,7 @@ class RetryingVmProvisioner(object):
1554
1557
  f'{to_provision.cloud} '
1555
1558
  f'{region.name}{colorama.Style.RESET_ALL}'
1556
1559
  f'{zone_str}.'))
1560
+ assert handle.cluster_yaml is not None
1557
1561
  provision_record = provisioner.bulk_provision(
1558
1562
  to_provision.cloud,
1559
1563
  region,
@@ -1586,7 +1590,9 @@ class RetryingVmProvisioner(object):
1586
1590
  # cluster does not exist. Also we are fast at
1587
1591
  # cleaning up clusters now if there is no existing node..
1588
1592
  CloudVmRayBackend().post_teardown_cleanup(
1589
- handle, terminate=not prev_cluster_ever_up)
1593
+ handle,
1594
+ terminate=not prev_cluster_ever_up,
1595
+ remove_from_db=False)
1590
1596
  # TODO(suquark): other clouds may have different zone
1591
1597
  # blocking strategy. See '_update_blocklist_on_error'
1592
1598
  # for details.
@@ -1703,7 +1709,8 @@ class RetryingVmProvisioner(object):
1703
1709
  # autoscaler proceeds to setup commands, which may fail:
1704
1710
  # ERR updater.py:138 -- New status: update-failed
1705
1711
  CloudVmRayBackend().teardown_no_lock(handle,
1706
- terminate=terminate_or_stop)
1712
+ terminate=terminate_or_stop,
1713
+ remove_from_db=False)
1707
1714
 
1708
1715
  if to_provision.zone is not None:
1709
1716
  message = (
@@ -2130,7 +2137,7 @@ class RetryingVmProvisioner(object):
2130
2137
  # TODO: set all remaining tasks' best_resources to None.
2131
2138
  task.best_resources = None
2132
2139
  try:
2133
- self._dag = sky.optimize(
2140
+ self._dag = optimizer.Optimizer.optimize(
2134
2141
  self._dag,
2135
2142
  minimize=self._optimize_target,
2136
2143
  blocked_resources=self._blocked_resources)
@@ -2176,14 +2183,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2176
2183
  """
2177
2184
  # Bump if any fields get added/removed/changed, and add backward
2178
2185
  # compatibility logic in __setstate__.
2179
- _VERSION = 9
2186
+ _VERSION = 10
2180
2187
 
2181
2188
  def __init__(
2182
2189
  self,
2183
2190
  *,
2184
2191
  cluster_name: str,
2185
2192
  cluster_name_on_cloud: str,
2186
- cluster_yaml: str,
2193
+ cluster_yaml: Optional[str],
2187
2194
  launched_nodes: int,
2188
2195
  launched_resources: resources_lib.Resources,
2189
2196
  stable_internal_external_ips: Optional[List[Tuple[str,
@@ -2196,7 +2203,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2196
2203
  self.cluster_name_on_cloud = cluster_name_on_cloud
2197
2204
  # Replace the home directory with ~ for better robustness across systems
2198
2205
  # with different home directories.
2199
- if cluster_yaml.startswith(os.path.expanduser('~')):
2206
+ if cluster_yaml is not None and cluster_yaml.startswith(
2207
+ os.path.expanduser('~')):
2200
2208
  cluster_yaml = cluster_yaml.replace(os.path.expanduser('~'), '~', 1)
2201
2209
  self._cluster_yaml = cluster_yaml
2202
2210
  # List of (internal_ip, feasible_ip) tuples for all the nodes in the
@@ -2403,7 +2411,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2403
2411
  internal_external_ips[1:], key=lambda x: x[1])
2404
2412
  self.stable_internal_external_ips = stable_internal_external_ips
2405
2413
 
2406
- @functools.lru_cache()
2414
+ @annotations.lru_cache(scope='global')
2407
2415
  @timeline.event
2408
2416
  def get_command_runners(self,
2409
2417
  force_cached: bool = False,
@@ -2520,9 +2528,15 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2520
2528
  self.docker_user = docker_user
2521
2529
 
2522
2530
  @property
2523
- def cluster_yaml(self):
2531
+ def cluster_yaml(self) -> Optional[str]:
2532
+ if self._cluster_yaml is None:
2533
+ return None
2524
2534
  return os.path.expanduser(self._cluster_yaml)
2525
2535
 
2536
+ @cluster_yaml.setter
2537
+ def cluster_yaml(self, value: Optional[str]):
2538
+ self._cluster_yaml = value
2539
+
2526
2540
  @property
2527
2541
  def ssh_user(self):
2528
2542
  if self.cached_cluster_info is not None:
@@ -2594,6 +2608,22 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2594
2608
  state['launched_resources'] = launched_resources.copy(
2595
2609
  region=context)
2596
2610
 
2611
+ if version < 10:
2612
+ # In #4660, we keep the cluster entry in the database even when it
2613
+ # is in the transition from one region to another during the
2614
+ # failover. We allow `handle.cluster_yaml` to be None to indicate
2615
+ # that the cluster yaml is intentionally removed. Before that PR,
2616
+ # the `handle.cluster_yaml` is always not None, even if it is
2617
+ # intentionally removed.
2618
+ #
2619
+ # For backward compatibility, we set the `_cluster_yaml` to None
2620
+ # if the file does not exist, assuming any removal of the
2621
+ # _cluster_yaml for existing clusters is intentional by SkyPilot.
2622
+ #
2623
+ if state['_cluster_yaml'] is not None and not os.path.exists(
2624
+ os.path.expanduser(state['_cluster_yaml'])):
2625
+ state['_cluster_yaml'] = None
2626
+
2597
2627
  self.__dict__.update(state)
2598
2628
 
2599
2629
  # Because the update_cluster_ips and update_ssh_ports
@@ -2618,6 +2648,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2618
2648
  pass
2619
2649
 
2620
2650
 
2651
+ @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
2621
2652
  class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2622
2653
  """Backend: runs on cloud virtual machines, managed by Ray.
2623
2654
 
@@ -2647,7 +2678,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2647
2678
 
2648
2679
  # Command for running the setup script. It is only set when the
2649
2680
  # setup needs to be run outside the self._setup() and as part of
2650
- # a job (--detach-setup).
2681
+ # a job (detach_setup, default).
2651
2682
  self._setup_cmd = None
2652
2683
 
2653
2684
  # --- Implementation of Backend APIs ---
@@ -2656,7 +2687,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2656
2687
  self._dag = kwargs.pop('dag', self._dag)
2657
2688
  self._optimize_target = kwargs.pop(
2658
2689
  'optimize_target',
2659
- self._optimize_target) or optimizer.OptimizeTarget.COST
2690
+ self._optimize_target) or common.OptimizeTarget.COST
2660
2691
  self._requested_features = kwargs.pop('requested_features',
2661
2692
  self._requested_features)
2662
2693
  assert not kwargs, f'Unexpected kwargs: {kwargs}'
@@ -2872,21 +2903,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2872
2903
  skip_unnecessary_provisioning)
2873
2904
  break
2874
2905
  except exceptions.ResourcesUnavailableError as e:
2875
- # Do not remove the stopped cluster from the global state
2876
- # if failed to start.
2906
+ log_path = retry_provisioner.log_dir + '/provision.log'
2907
+ error_message = (
2908
+ f'{colorama.Fore.RED}Failed to provision all '
2909
+ f'possible launchable resources.'
2910
+ f'{colorama.Style.RESET_ALL}'
2911
+ ' Relax the task\'s resource requirements: '
2912
+ f'{task.num_nodes}x {list(task.resources)[0]}')
2877
2913
  if e.no_failover:
2878
2914
  error_message = str(e)
2879
- else:
2880
- usage_lib.messages.usage.update_final_cluster_status(
2881
- None)
2882
- error_message = (
2883
- f'{colorama.Fore.RED}Failed to provision all '
2884
- f'possible launchable resources.'
2885
- f'{colorama.Style.RESET_ALL}'
2886
- ' Relax the task\'s resource requirements: '
2887
- f'{task.num_nodes}x {list(task.resources)[0]}')
2888
2915
 
2889
- log_path = retry_provisioner.log_dir + '/provision.log'
2890
2916
  if retry_until_up:
2891
2917
  logger.error(error_message)
2892
2918
  # Sleep and retry.
@@ -2901,6 +2927,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2901
2927
  attempt_cnt += 1
2902
2928
  time.sleep(gap_seconds)
2903
2929
  continue
2930
+ # Clean up the cluster's entry in `sky status`.
2931
+ # Do not remove the stopped cluster from the global state
2932
+ # if failed to start.
2933
+ if not e.no_failover:
2934
+ global_user_state.remove_cluster(cluster_name,
2935
+ terminate=True)
2936
+ usage_lib.messages.usage.update_final_cluster_status(
2937
+ None)
2904
2938
  logger.error(
2905
2939
  ux_utils.error_message(
2906
2940
  'Failed to provision resources. '
@@ -2966,8 +3000,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2966
3000
 
2967
3001
  self._update_after_cluster_provisioned(
2968
3002
  handle, to_provision_config.prev_handle, task,
2969
- prev_cluster_status, handle.external_ips(),
2970
- handle.external_ssh_ports(), lock_path, config_hash)
3003
+ prev_cluster_status, lock_path, config_hash)
2971
3004
  return handle
2972
3005
 
2973
3006
  cluster_config_file = config_dict['ray']
@@ -3039,8 +3072,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3039
3072
 
3040
3073
  self._update_after_cluster_provisioned(
3041
3074
  handle, to_provision_config.prev_handle, task,
3042
- prev_cluster_status, ip_list, ssh_port_list, lock_path,
3043
- config_hash)
3075
+ prev_cluster_status, lock_path, config_hash)
3044
3076
  return handle
3045
3077
 
3046
3078
  def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -3058,8 +3090,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3058
3090
  prev_handle: Optional[CloudVmRayResourceHandle],
3059
3091
  task: task_lib.Task,
3060
3092
  prev_cluster_status: Optional[status_lib.ClusterStatus],
3061
- ip_list: List[str], ssh_port_list: List[int], lock_path: str,
3062
- config_hash: str) -> None:
3093
+ lock_path: str, config_hash: str) -> None:
3063
3094
  usage_lib.messages.usage.update_cluster_resources(
3064
3095
  handle.launched_nodes, handle.launched_resources)
3065
3096
  usage_lib.messages.usage.update_final_cluster_status(
@@ -3123,15 +3154,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3123
3154
  )
3124
3155
  usage_lib.messages.usage.update_final_cluster_status(
3125
3156
  status_lib.ClusterStatus.UP)
3157
+ # We still add the cluster to ssh config file on API server, this
3158
+ # is helpful for people trying to use `sky launch`'ed cluster for
3159
+ # ssh proxy jump.
3126
3160
  auth_config = backend_utils.ssh_credential_from_yaml(
3127
3161
  handle.cluster_yaml,
3128
3162
  ssh_user=handle.ssh_user,
3129
3163
  docker_user=handle.docker_user)
3130
- backend_utils.SSHConfigHelper.add_cluster(handle.cluster_name,
3131
- ip_list, auth_config,
3132
- ssh_port_list,
3133
- handle.docker_user,
3134
- handle.ssh_user)
3164
+ cluster_utils.SSHConfigHelper.add_cluster(
3165
+ handle.cluster_name, handle.cached_external_ips, auth_config,
3166
+ handle.cached_external_ssh_ports, handle.docker_user,
3167
+ handle.ssh_user)
3135
3168
 
3136
3169
  common_utils.remove_file_if_exists(lock_path)
3137
3170
 
@@ -3192,7 +3225,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3192
3225
  ux_utils.spinner_message('Syncing workdir', log_path)):
3193
3226
  subprocess_utils.run_in_parallel(_sync_workdir_node, runners,
3194
3227
  num_threads)
3195
- logger.info(ux_utils.finishing_message('Workdir synced.', log_path))
3228
+ logger.info(ux_utils.finishing_message('Synced workdir.', log_path))
3196
3229
 
3197
3230
  def _sync_file_mounts(
3198
3231
  self,
@@ -3346,9 +3379,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3346
3379
 
3347
3380
  if detach_setup:
3348
3381
  # Only set this when setup needs to be run outside the self._setup()
3349
- # as part of a job (--detach-setup).
3382
+ # as part of a job (detach_setup, default).
3350
3383
  self._setup_cmd = setup_cmd
3351
- logger.info(ux_utils.finishing_message('Setup completed.'))
3384
+ logger.info(ux_utils.finishing_message('Setup detached.'))
3352
3385
  return
3353
3386
  end = time.time()
3354
3387
  logger.debug(f'Setup took {end - start} seconds.')
@@ -3365,9 +3398,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3365
3398
  managed_job_dag: Optional['dag.Dag'] = None,
3366
3399
  ) -> None:
3367
3400
  """Executes generated code on the head node."""
3368
- style = colorama.Style
3369
- fore = colorama.Fore
3370
-
3371
3401
  script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
3372
3402
  remote_log_dir = self.log_dir
3373
3403
  remote_log_path = os.path.join(remote_log_dir, 'run.log')
@@ -3457,58 +3487,22 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3457
3487
  logger.info(
3458
3488
  ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
3459
3489
  rich_utils.stop_safe_status()
3460
- try:
3461
- if not detach_run:
3462
- if (handle.cluster_name in controller_utils.Controllers.
3463
- JOBS_CONTROLLER.value.candidate_cluster_names):
3464
- self.tail_managed_job_logs(handle, job_id)
3465
- else:
3466
- # Sky logs. Not using subprocess.run since it will make the
3467
- # ssh keep connected after ctrl-c.
3468
- self.tail_logs(handle, job_id)
3469
- finally:
3470
- name = handle.cluster_name
3471
- controller = controller_utils.Controllers.from_name(name)
3472
- if controller == controller_utils.Controllers.JOBS_CONTROLLER:
3473
- logger.info(
3474
- f'\n{fore.CYAN}Managed Job ID: '
3475
- f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
3476
- f'\n📋 Useful Commands'
3477
- f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t\t'
3478
- f'{ux_utils.BOLD}sky jobs cancel {job_id}'
3479
- f'{ux_utils.RESET_BOLD}'
3480
- f'\n{ux_utils.INDENT_SYMBOL}To stream job logs:\t\t\t'
3481
- f'{ux_utils.BOLD}sky jobs logs {job_id}'
3482
- f'{ux_utils.RESET_BOLD}'
3483
- f'\n{ux_utils.INDENT_SYMBOL}To stream controller logs:\t\t'
3484
- f'{ux_utils.BOLD}sky jobs logs --controller {job_id}'
3485
- f'{ux_utils.RESET_BOLD}'
3486
- f'\n{ux_utils.INDENT_SYMBOL}To view all managed jobs:\t\t'
3487
- f'{ux_utils.BOLD}sky jobs queue'
3488
- f'{ux_utils.RESET_BOLD}'
3489
- f'\n{ux_utils.INDENT_LAST_SYMBOL}To view managed job '
3490
- f'dashboard:\t{ux_utils.BOLD}sky jobs dashboard'
3491
- f'{ux_utils.RESET_BOLD}')
3492
- elif controller is None:
3493
- logger.info(f'\n{fore.CYAN}Job ID: '
3494
- f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
3495
- f'\n📋 Useful Commands'
3496
- f'\n{ux_utils.INDENT_SYMBOL}To cancel the job:\t\t'
3497
- f'{ux_utils.BOLD}sky cancel {name} {job_id}'
3498
- f'{ux_utils.RESET_BOLD}'
3499
- f'\n{ux_utils.INDENT_SYMBOL}To stream job logs:\t\t'
3500
- f'{ux_utils.BOLD}sky logs {name} {job_id}'
3501
- f'{ux_utils.RESET_BOLD}'
3502
- f'\n{ux_utils.INDENT_LAST_SYMBOL}To view job '
3503
- 'queue:\t\t'
3504
- f'{ux_utils.BOLD}sky queue {name}'
3505
- f'{ux_utils.RESET_BOLD}')
3490
+ if not detach_run:
3491
+ if (handle.cluster_name == controller_utils.Controllers.
3492
+ JOBS_CONTROLLER.value.cluster_name):
3493
+ self.tail_managed_job_logs(handle, job_id)
3494
+ else:
3495
+ # Sky logs. Not using subprocess.run since it will make the
3496
+ # ssh keep connected after ctrl-c.
3497
+ self.tail_logs(handle, job_id)
3506
3498
 
3507
3499
  def _add_job(self, handle: CloudVmRayResourceHandle,
3508
3500
  job_name: Optional[str], resources_str: str) -> int:
3509
- username = getpass.getuser()
3510
- code = job_lib.JobLibCodeGen.add_job(job_name, username,
3511
- self.run_timestamp, resources_str)
3501
+ code = job_lib.JobLibCodeGen.add_job(
3502
+ job_name=job_name,
3503
+ username=common_utils.get_user_hash(),
3504
+ run_timestamp=self.run_timestamp,
3505
+ resources_str=resources_str)
3512
3506
  returncode, job_id_str, stderr = self.run_on_head(handle,
3513
3507
  code,
3514
3508
  stream_logs=False,
@@ -3548,13 +3542,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3548
3542
  Job id if the task is submitted to the cluster, None otherwise.
3549
3543
  """
3550
3544
  if task.run is None and self._setup_cmd is None:
3551
- # This message is fine without mentioning setup, as there are three
3545
+ # This message is fine without mentioning setup, as there are two
3552
3546
  # cases when run section is empty:
3553
- # 1. setup specified, no --detach-setup: setup is executed and this
3554
- # message is fine for saying no run command specified.
3555
- # 2. setup specified, with --detach-setup: setup is executed in
3556
- # detached mode and this message will not be shown.
3557
- # 3. no setup specified: this message is fine as a user is likely
3547
+ # 1. setup specified: setup is executed in detached mode and this
3548
+ # message will not be shown.
3549
+ # 2. no setup specified: this message is fine as a user is likely
3558
3550
  # creating a cluster only, and ok with the empty run command.
3559
3551
  logger.info('Run commands not specified or empty.')
3560
3552
  return None
@@ -3601,26 +3593,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3601
3593
 
3602
3594
  def _post_execute(self, handle: CloudVmRayResourceHandle,
3603
3595
  down: bool) -> None:
3604
- name = handle.cluster_name
3605
- controller = controller_utils.Controllers.from_name(name)
3606
- if controller is not None:
3607
- return
3608
- logger.info(f'\nCluster name: {name}'
3609
- f'\n{ux_utils.INDENT_SYMBOL}To log into the head VM:\t'
3610
- f'{ux_utils.BOLD}ssh {name}'
3611
- f'{ux_utils.RESET_BOLD}'
3612
- f'\n{ux_utils.INDENT_SYMBOL}To submit a job:'
3613
- f'\t\t{ux_utils.BOLD}sky exec {name} yaml_file'
3614
- f'{ux_utils.RESET_BOLD}'
3615
- f'\n{ux_utils.INDENT_SYMBOL}To stop the cluster:'
3616
- f'\t{ux_utils.BOLD}sky stop {name}'
3617
- f'{ux_utils.RESET_BOLD}'
3618
- f'\n{ux_utils.INDENT_LAST_SYMBOL}To teardown the cluster:'
3619
- f'\t{ux_utils.BOLD}sky down {name}'
3620
- f'{ux_utils.RESET_BOLD}')
3621
- if (gcp_utils.is_tpu(handle.launched_resources) and
3622
- not gcp_utils.is_tpu_vm(handle.launched_resources)):
3623
- logger.info('Tip: `sky down` will delete launched TPU(s) too.')
3596
+ """Post-execute cleanup."""
3597
+ del handle, down # Unused.
3598
+ # All logic is handled in previous stages, no-op.
3624
3599
 
3625
3600
  def _teardown_ephemeral_storage(self, task: task_lib.Task) -> None:
3626
3601
  storage_mounts = task.storage_mounts
@@ -3668,30 +3643,47 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3668
3643
  is_identity_mismatch_and_purge = True
3669
3644
  else:
3670
3645
  raise
3671
-
3672
3646
  lock_path = os.path.expanduser(
3673
3647
  backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
3674
-
3675
- try:
3676
- with timeline.FileLockEvent(
3677
- lock_path,
3678
- backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
3679
- self.teardown_no_lock(
3680
- handle,
3681
- terminate,
3682
- purge,
3683
- # When --purge is set and we already see an ID mismatch
3684
- # error, we skip the refresh codepath. This is because
3685
- # refresh checks current user identity can throw
3686
- # ClusterOwnerIdentityMismatchError. The argument/flag
3687
- # `purge` should bypass such ID mismatch errors.
3688
- refresh_cluster_status=not is_identity_mismatch_and_purge)
3689
- if terminate:
3690
- common_utils.remove_file_if_exists(lock_path)
3691
- except filelock.Timeout as e:
3692
- raise RuntimeError(
3693
- f'Cluster {cluster_name!r} is locked by {lock_path}. '
3694
- 'Check to see if it is still being launched') from e
3648
+ # Retry in case new cluster operation comes in and holds the lock
3649
+ # right after the lock is removed.
3650
+ n_attempts = 2
3651
+ while True:
3652
+ n_attempts -= 1
3653
+ # In case other running cluster operations are still holding the
3654
+ # lock.
3655
+ common_utils.remove_file_if_exists(lock_path)
3656
+ # We have to kill the cluster requests, because `down` and `stop`
3657
+ # should be higher priority than the cluster requests, and we should
3658
+ # release the lock from other requests.
3659
+ exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
3660
+ requests_lib.kill_cluster_requests(handle.cluster_name,
3661
+ exclude_request_to_kill)
3662
+ try:
3663
+ with filelock.FileLock(
3664
+ lock_path,
3665
+ backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
3666
+ self.teardown_no_lock(
3667
+ handle,
3668
+ terminate,
3669
+ purge,
3670
+ # When --purge is set and we already see an ID mismatch
3671
+ # error, we skip the refresh codepath. This is because
3672
+ # refresh checks current user identity can throw
3673
+ # ClusterOwnerIdentityMismatchError. The argument/flag
3674
+ # `purge` should bypass such ID mismatch errors.
3675
+ refresh_cluster_status=(
3676
+ not is_identity_mismatch_and_purge))
3677
+ if terminate:
3678
+ common_utils.remove_file_if_exists(lock_path)
3679
+ break
3680
+ except filelock.Timeout as e:
3681
+ logger.debug(f'Failed to acquire lock for {cluster_name}, '
3682
+ f'retrying...')
3683
+ if n_attempts <= 0:
3684
+ raise RuntimeError(
3685
+ f'Cluster {cluster_name!r} is locked by {lock_path}. '
3686
+ 'Check to see if it is still being launched') from e
3695
3687
 
3696
3688
  # --- CloudVMRayBackend Specific APIs ---
3697
3689
 
@@ -3715,24 +3707,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3715
3707
  def cancel_jobs(self,
3716
3708
  handle: CloudVmRayResourceHandle,
3717
3709
  jobs: Optional[List[int]],
3718
- cancel_all: bool = False) -> None:
3710
+ cancel_all: bool = False,
3711
+ user_hash: Optional[str] = None) -> None:
3719
3712
  """Cancels jobs.
3720
3713
 
3721
- CloudVMRayBackend specific method.
3722
-
3723
- Args:
3724
- handle: The cluster handle.
3725
- jobs: Job IDs to cancel. (See `cancel_all` for special semantics.)
3726
- cancel_all: Whether to cancel all jobs. If True, asserts `jobs` is
3727
- set to None. If False and `jobs` is None, cancel the latest
3728
- running job.
3714
+ See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
3729
3715
  """
3730
- if cancel_all:
3731
- assert jobs is None, (
3732
- 'If cancel_all=True, usage is to set jobs=None')
3733
- code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all)
3734
-
3735
- # All error messages should have been redirected to stdout.
3716
+ code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all, user_hash)
3736
3717
  returncode, stdout, _ = self.run_on_head(handle,
3737
3718
  code,
3738
3719
  stream_logs=False,
@@ -3741,13 +3722,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3741
3722
  returncode, code,
3742
3723
  f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout)
3743
3724
 
3744
- cancelled_ids = common_utils.decode_payload(stdout)
3725
+ cancelled_ids = message_utils.decode_payload(stdout)
3745
3726
  if cancelled_ids:
3746
3727
  logger.info(
3747
3728
  f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
3748
3729
  else:
3749
- logger.info(
3750
- 'No jobs cancelled. They may already be in terminal states.')
3730
+ logger.info('No jobs cancelled. They may be in terminal states.')
3751
3731
 
3752
3732
  def sync_down_logs(
3753
3733
  self,
@@ -3768,7 +3748,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3768
3748
  separate_stderr=True)
3769
3749
  subprocess_utils.handle_returncode(returncode, code,
3770
3750
  'Failed to sync logs.', stderr)
3771
- run_timestamps = common_utils.decode_payload(run_timestamps)
3751
+ run_timestamps = message_utils.decode_payload(run_timestamps)
3772
3752
  if not run_timestamps:
3773
3753
  logger.info(f'{colorama.Fore.YELLOW}'
3774
3754
  'No matching log directories found'
@@ -3782,16 +3762,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3782
3762
  for run_timestamp in run_timestamps
3783
3763
  ]
3784
3764
  local_log_dirs = [
3785
- os.path.expanduser(os.path.join(local_dir, run_timestamp))
3765
+ os.path.join(local_dir, run_timestamp)
3786
3766
  for run_timestamp in run_timestamps
3787
3767
  ]
3788
3768
 
3789
- style = colorama.Style
3790
- fore = colorama.Fore
3791
- for job_id, log_dir in zip(job_ids, local_log_dirs):
3792
- logger.info(f'{fore.CYAN}Job {job_id} logs: {log_dir}'
3793
- f'{style.RESET_ALL}')
3794
-
3795
3769
  runners = handle.get_command_runners()
3796
3770
 
3797
3771
  def _rsync_down(args) -> None:
@@ -3802,13 +3776,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3802
3776
  """
3803
3777
  (runner, local_log_dir, remote_log_dir) = args
3804
3778
  try:
3805
- os.makedirs(local_log_dir, exist_ok=True)
3779
+ os.makedirs(os.path.expanduser(local_log_dir), exist_ok=True)
3806
3780
  runner.rsync(
3807
3781
  # Require a `/` at the end to make sure the parent dir
3808
3782
  # are not created locally. We do not add additional '*' as
3809
3783
  # kubernetes's rsync does not work with an ending '*'.
3810
3784
  source=f'{remote_log_dir}/',
3811
- target=local_log_dir,
3785
+ target=os.path.expanduser(local_log_dir),
3812
3786
  up=False,
3813
3787
  stream_logs=False,
3814
3788
  )
@@ -3864,10 +3838,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3864
3838
  # Allocate a pseudo-terminal to disable output buffering.
3865
3839
  # Otherwise, there may be 5 minutes delay in logging.
3866
3840
  ssh_mode=command_runner.SshMode.INTERACTIVE,
3867
- # Disable stdin to avoid ray outputs mess up the terminal with
3868
- # misaligned output in multithreading/multiprocessing.
3869
- # Refer to: https://github.com/ray-project/ray/blob/d462172be7c5779abf37609aed08af112a533e1e/python/ray/autoscaler/_private/subprocess_output_util.py#L264 # pylint: disable=line-too-long
3870
- stdin=subprocess.DEVNULL,
3871
3841
  )
3872
3842
  except SystemExit as e:
3873
3843
  returncode = e.code
@@ -3897,7 +3867,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3897
3867
  stream_logs=True,
3898
3868
  process_stream=False,
3899
3869
  ssh_mode=command_runner.SshMode.INTERACTIVE,
3900
- stdin=subprocess.DEVNULL,
3901
3870
  )
3902
3871
 
3903
3872
  def sync_down_managed_job_logs(
@@ -3936,7 +3905,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3936
3905
  subprocess_utils.handle_returncode(returncode, code,
3937
3906
  'Failed to sync down logs.',
3938
3907
  stderr)
3939
- job_ids = common_utils.decode_payload(job_ids)
3908
+ job_ids = message_utils.decode_payload(job_ids)
3940
3909
  if not job_ids:
3941
3910
  logger.info(f'{colorama.Fore.YELLOW}'
3942
3911
  'No matching job found'
@@ -3947,9 +3916,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3947
3916
  if job_name is not None:
3948
3917
  name_str = ('Multiple jobs IDs found under the name '
3949
3918
  f'{job_name}. ')
3919
+ controller_str = ' (controller)' if controller else ''
3950
3920
  logger.info(f'{colorama.Fore.YELLOW}'
3951
3921
  f'{name_str}'
3952
- 'Downloading the latest job logs.'
3922
+ f'Downloading the latest job logs{controller_str}.'
3953
3923
  f'{colorama.Style.RESET_ALL}')
3954
3924
  # list should aready be in descending order
3955
3925
  job_id = job_ids[0]
@@ -3967,7 +3937,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3967
3937
  subprocess_utils.handle_returncode(returncode, code,
3968
3938
  'Failed to sync logs.', stderr)
3969
3939
  # returns with a dict of {job_id: run_timestamp}
3970
- run_timestamps = common_utils.decode_payload(run_timestamps)
3940
+ run_timestamps = message_utils.decode_payload(run_timestamps)
3971
3941
  if not run_timestamps:
3972
3942
  logger.info(f'{colorama.Fore.YELLOW}'
3973
3943
  'No matching log directories found'
@@ -3978,15 +3948,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3978
3948
  job_id = list(run_timestamps.keys())[0]
3979
3949
  local_log_dir = ''
3980
3950
  if controller: # download controller logs
3981
- remote_log = os.path.join(
3982
- managed_jobs_constants.JOBS_CONTROLLER_LOGS_DIR,
3983
- f'{job_id}.log')
3984
- local_log_dir = os.path.expanduser(
3985
- os.path.join(local_dir, run_timestamp))
3986
-
3987
- logger.info(f'{colorama.Fore.CYAN}'
3988
- f'Job {job_id} local logs: {local_log_dir}'
3989
- f'{colorama.Style.RESET_ALL}')
3951
+ remote_log = os.path.join(managed_jobs.JOBS_CONTROLLER_LOGS_DIR,
3952
+ f'{job_id}.log')
3953
+ local_log_dir = os.path.join(local_dir, run_timestamp)
3954
+ os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
3955
+ exist_ok=True)
3956
+
3957
+ logger.debug(f'{colorama.Fore.CYAN}'
3958
+ f'Job {job_id} local logs: {local_log_dir}'
3959
+ f'{colorama.Style.RESET_ALL}')
3990
3960
 
3991
3961
  runners = handle.get_command_runners()
3992
3962
 
@@ -3998,7 +3968,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3998
3968
  """
3999
3969
  (runner, local_log_dir, remote_log) = args
4000
3970
  try:
4001
- os.makedirs(local_log_dir, exist_ok=True)
3971
+ os.makedirs(os.path.expanduser(local_log_dir),
3972
+ exist_ok=True)
4002
3973
  runner.rsync(
4003
3974
  source=remote_log,
4004
3975
  target=f'{local_log_dir}/controller.log',
@@ -4019,9 +3990,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4019
3990
  ]
4020
3991
  subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
4021
3992
  else: # download job logs
4022
- local_log_dir = os.path.expanduser(
4023
- os.path.join(local_dir, 'managed_jobs', run_timestamp))
4024
- os.makedirs(os.path.dirname(local_log_dir), exist_ok=True)
3993
+ local_log_dir = os.path.join(local_dir, 'managed_jobs',
3994
+ run_timestamp)
3995
+ os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
3996
+ exist_ok=True)
4025
3997
  log_file = os.path.join(local_log_dir, 'run.log')
4026
3998
 
4027
3999
  code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
@@ -4040,16 +4012,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4040
4012
  self.run_on_head(
4041
4013
  handle,
4042
4014
  code,
4043
- log_path=log_file,
4015
+ log_path=os.path.expanduser(log_file),
4044
4016
  stream_logs=False,
4045
4017
  process_stream=False,
4046
4018
  ssh_mode=command_runner.SshMode.INTERACTIVE,
4047
- stdin=subprocess.DEVNULL,
4048
4019
  )
4049
4020
 
4050
- logger.info(f'{colorama.Fore.CYAN}'
4051
- f'Job {job_id} logs: {local_log_dir}'
4052
- f'{colorama.Style.RESET_ALL}')
4021
+ logger.debug(f'{colorama.Fore.CYAN}'
4022
+ f'Job {job_id} logs: {local_log_dir}'
4023
+ f'{colorama.Style.RESET_ALL}')
4053
4024
  return {str(job_id): local_log_dir}
4054
4025
 
4055
4026
  def teardown_no_lock(self,
@@ -4057,7 +4028,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4057
4028
  terminate: bool,
4058
4029
  purge: bool = False,
4059
4030
  post_teardown_cleanup: bool = True,
4060
- refresh_cluster_status: bool = True) -> None:
4031
+ refresh_cluster_status: bool = True,
4032
+ remove_from_db: bool = True) -> None:
4061
4033
  """Teardown the cluster without acquiring the cluster status lock.
4062
4034
 
4063
4035
  NOTE: This method should not be called without holding the cluster
@@ -4069,6 +4041,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4069
4041
  Raises:
4070
4042
  RuntimeError: If the cluster fails to be terminated/stopped.
4071
4043
  """
4044
+ exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
4045
+ # We have to kill the cluster requests again within the lock, because
4046
+ # any pending requests on the same cluster should be cancelled after
4047
+ # the cluster is terminated/stopped. Otherwise, it will be quite
4048
+ # confusing to see the cluster restarted immediately after it is
4049
+ # terminated/stopped, when there is a pending launch request.
4050
+ requests_lib.kill_cluster_requests(handle.cluster_name,
4051
+ exclude_request_to_kill)
4072
4052
  cluster_status_fetched = False
4073
4053
  if refresh_cluster_status:
4074
4054
  try:
@@ -4096,6 +4076,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4096
4076
  f'Cluster {handle.cluster_name!r} is already terminated. '
4097
4077
  'Skipped.')
4098
4078
  return
4079
+
4080
+ if handle.cluster_yaml is None:
4081
+ logger.warning(f'Cluster {handle.cluster_name!r} has no '
4082
+ f'provision yaml so it '
4083
+ 'has not been provisioned. Skipped.')
4084
+ global_user_state.remove_cluster(handle.cluster_name,
4085
+ terminate=terminate)
4086
+ return
4099
4087
  log_path = os.path.join(os.path.expanduser(self.log_dir),
4100
4088
  'teardown.log')
4101
4089
  log_abs_path = os.path.abspath(log_path)
@@ -4150,7 +4138,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4150
4138
  raise
4151
4139
 
4152
4140
  if post_teardown_cleanup:
4153
- self.post_teardown_cleanup(handle, terminate, purge)
4141
+ self.post_teardown_cleanup(handle, terminate, purge,
4142
+ remove_from_db)
4154
4143
  return
4155
4144
 
4156
4145
  if (isinstance(cloud, clouds.IBM) and terminate and
@@ -4271,7 +4260,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4271
4260
  def post_teardown_cleanup(self,
4272
4261
  handle: CloudVmRayResourceHandle,
4273
4262
  terminate: bool,
4274
- purge: bool = False) -> None:
4263
+ purge: bool = False,
4264
+ remove_from_db: bool = True) -> None:
4275
4265
  """Cleanup local configs/caches and delete TPUs after teardown.
4276
4266
 
4277
4267
  This method will handle the following cleanup steps:
@@ -4302,96 +4292,100 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4302
4292
  'remove it manually to avoid image leakage. Details: '
4303
4293
  f'{common_utils.format_exception(e, use_bracket=True)}')
4304
4294
  if terminate:
4305
- config = common_utils.read_yaml(handle.cluster_yaml)
4306
- try:
4307
- cloud.check_features_are_supported(
4308
- handle.launched_resources,
4309
- {clouds.CloudImplementationFeatures.OPEN_PORTS})
4310
- provision_lib.cleanup_ports(repr(cloud), cluster_name_on_cloud,
4311
- handle.launched_resources.ports,
4312
- config['provider'])
4313
- except exceptions.NotSupportedError:
4314
- pass
4315
- except exceptions.PortDoesNotExistError:
4316
- logger.debug('Ports do not exist. Skipping cleanup.')
4317
- except Exception as e: # pylint: disable=broad-except
4318
- if purge:
4319
- logger.warning(
4320
- f'Failed to cleanup ports. Skipping since purge is '
4321
- f'set. Details: '
4322
- f'{common_utils.format_exception(e, use_bracket=True)}')
4323
- else:
4324
- raise
4295
+ # This function could be directly called from status refresh,
4296
+ # where we need to cleanup the cluster profile.
4297
+ metadata_utils.remove_cluster_metadata(handle.cluster_name)
4298
+ # The cluster yaml does not exist when skypilot has not found
4299
+ # the right resource to provision the cluster.
4300
+ if handle.cluster_yaml is not None:
4301
+ try:
4302
+ cloud = handle.launched_resources.cloud
4303
+ config = common_utils.read_yaml(handle.cluster_yaml)
4304
+ cloud.check_features_are_supported(
4305
+ handle.launched_resources,
4306
+ {clouds.CloudImplementationFeatures.OPEN_PORTS})
4307
+ provision_lib.cleanup_ports(repr(cloud),
4308
+ cluster_name_on_cloud,
4309
+ handle.launched_resources.ports,
4310
+ config['provider'])
4311
+ self.remove_cluster_config(handle)
4312
+ except exceptions.NotSupportedError:
4313
+ pass
4314
+ except exceptions.PortDoesNotExistError:
4315
+ logger.debug('Ports do not exist. Skipping cleanup.')
4316
+ except Exception as e: # pylint: disable=broad-except
4317
+ if purge:
4318
+ msg = common_utils.format_exception(e, use_bracket=True)
4319
+ logger.warning(
4320
+ f'Failed to cleanup ports. Skipping since purge is '
4321
+ f'set. Details: {msg}')
4322
+ else:
4323
+ raise
4325
4324
 
4326
- # The cluster file must exist because the cluster_yaml will only
4327
- # be removed after the cluster entry in the database is removed.
4328
- config = common_utils.read_yaml(handle.cluster_yaml)
4329
- backend_utils.SSHConfigHelper.remove_cluster(handle.cluster_name)
4330
-
4331
- # Confirm that instances have actually transitioned state before
4332
- # updating the state database. We do this immediately before removing
4333
- # the state from the database, so that we can guarantee that this is
4334
- # always called before the state is removed. We considered running this
4335
- # check as part of provisioner.teardown_cluster or
4336
- # provision.terminate_instances, but it would open the door code paths
4337
- # that successfully call this function but do not first call
4338
- # teardown_cluster or terminate_instances. See
4339
- # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032
4340
- attempts = 0
4341
- while True:
4342
- logger.debug(f'instance statuses attempt {attempts + 1}')
4343
- try:
4325
+ sky.utils.cluster_utils.SSHConfigHelper.remove_cluster(
4326
+ handle.cluster_name)
4327
+
4328
+ def _detect_abnormal_non_terminated_nodes(
4329
+ handle: CloudVmRayResourceHandle) -> None:
4330
+ # Confirm that instances have actually transitioned state before
4331
+ # updating the state database. We do this immediately before
4332
+ # removing the state from the database, so that we can guarantee
4333
+ # that this is always called before the state is removed. We
4334
+ # considered running this check as part of
4335
+ # provisioner.teardown_cluster or provision.terminate_instances, but
4336
+ # it would open the door to code paths that successfully call this
4337
+ # function but do not first call teardown_cluster or
4338
+ # terminate_instances. See
4339
+ # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032
4340
+ attempts = 0
4341
+ while True:
4342
+ config = common_utils.read_yaml(handle.cluster_yaml)
4343
+
4344
+ logger.debug(f'instance statuses attempt {attempts + 1}')
4344
4345
  node_status_dict = provision_lib.query_instances(
4345
4346
  repr(cloud),
4346
4347
  cluster_name_on_cloud,
4347
4348
  config['provider'],
4348
4349
  non_terminated_only=False)
4349
- except Exception as e: # pylint: disable=broad-except
4350
- if purge:
4351
- logger.warning(
4352
- f'Failed to query instances. Skipping since purge is '
4353
- f'set. Details: '
4354
- f'{common_utils.format_exception(e, use_bracket=True)}')
4355
- break
4356
- raise
4357
4350
 
4358
- unexpected_node_state: Optional[Tuple[str, str]] = None
4359
- for node_id, node_status in node_status_dict.items():
4360
- logger.debug(f'{node_id} status: {node_status}')
4361
- # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
4362
- # between "stopping/stopped" and "terminating/terminated", so we
4363
- # allow for either status instead of casing on `terminate`.
4364
- if node_status not in [None, status_lib.ClusterStatus.STOPPED]:
4365
- unexpected_node_state = (node_id, node_status)
4366
-
4367
- if unexpected_node_state is None:
4368
- break
4351
+ unexpected_node_state: Optional[Tuple[str, str]] = None
4352
+ for node_id, node_status in node_status_dict.items():
4353
+ logger.debug(f'{node_id} status: {node_status}')
4354
+ # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
4355
+ # between "stopping/stopped" and "terminating/terminated",
4356
+ # so we allow for either status instead of casing on
4357
+ # `terminate`.
4358
+ if node_status not in [
4359
+ None, status_lib.ClusterStatus.STOPPED
4360
+ ]:
4361
+ unexpected_node_state = (node_id, node_status)
4362
+ break
4369
4363
 
4370
- attempts += 1
4371
- if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
4372
- time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
4373
- else:
4374
- (node_id, node_status) = unexpected_node_state
4375
- if purge:
4376
- logger.warning(f'Instance {node_id} in unexpected '
4377
- f'state {node_status}. Skipping since purge '
4378
- 'is set.')
4364
+ if unexpected_node_state is None:
4379
4365
  break
4380
- raise RuntimeError(f'Instance {node_id} in unexpected '
4381
- f'state {node_status}.')
4382
4366
 
4383
- global_user_state.remove_cluster(handle.cluster_name,
4384
- terminate=terminate)
4367
+ attempts += 1
4368
+ if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
4369
+ time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
4370
+ else:
4371
+ (node_id, node_status) = unexpected_node_state
4372
+ raise RuntimeError(f'Instance {node_id} in unexpected '
4373
+ f'state {node_status}.')
4385
4374
 
4386
- if terminate:
4387
- # This function could be directly called from status refresh,
4388
- # where we need to cleanup the cluster profile.
4389
- metadata_utils.remove_cluster_metadata(handle.cluster_name)
4375
+ # If cluster_yaml is None, the cluster should ensured to be terminated,
4376
+ # so we don't need to do the double check.
4377
+ if handle.cluster_yaml is not None:
4378
+ _detect_abnormal_non_terminated_nodes(handle)
4390
4379
 
4391
- # Clean up generated config
4392
- # No try-except is needed since Ray will fail to teardown the
4393
- # cluster if the cluster_yaml is missing.
4394
- common_utils.remove_file_if_exists(handle.cluster_yaml)
4380
+ if not terminate or remove_from_db:
4381
+ global_user_state.remove_cluster(handle.cluster_name,
4382
+ terminate=terminate)
4383
+
4384
+ def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
4385
+ """Remove the YAML config of a cluster."""
4386
+ handle.cluster_yaml = None
4387
+ global_user_state.update_cluster_handle(handle.cluster_name, handle)
4388
+ common_utils.remove_file_if_exists(handle.cluster_yaml)
4395
4389
 
4396
4390
  def set_autostop(self,
4397
4391
  handle: CloudVmRayResourceHandle,
@@ -4468,7 +4462,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4468
4462
  stream_logs=stream_logs)
4469
4463
 
4470
4464
  if returncode == 0:
4471
- return common_utils.decode_payload(stdout)
4465
+ return message_utils.decode_payload(stdout)
4472
4466
  logger.debug('Failed to check if cluster is autostopping with '
4473
4467
  f'{returncode}: {stdout+stderr}\n'
4474
4468
  f'Command: {code}')
@@ -4707,7 +4701,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4707
4701
  if not data_utils.is_cloud_store_url(src):
4708
4702
  full_src = os.path.abspath(os.path.expanduser(src))
4709
4703
  # Checked during Task.set_file_mounts().
4710
- assert os.path.exists(full_src), f'{full_src} does not exist.'
4704
+ assert os.path.exists(
4705
+ full_src), f'{full_src} does not exist. {file_mounts}'
4711
4706
  src_size = backend_utils.path_size_megabytes(full_src)
4712
4707
  if src_size >= _PATH_SIZE_MEGABYTES_WARN_THRESHOLD:
4713
4708
  logger.warning(
@@ -4822,7 +4817,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4822
4817
  num_threads)
4823
4818
  end = time.time()
4824
4819
  logger.debug(f'File mount sync took {end - start} seconds.')
4825
- logger.info(ux_utils.finishing_message('Files synced.', log_path))
4820
+ logger.info(ux_utils.finishing_message('Synced file_mounts.', log_path))
4826
4821
 
4827
4822
  def _execute_storage_mounts(
4828
4823
  self, handle: CloudVmRayResourceHandle,
@@ -4858,6 +4853,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4858
4853
  f'Mounting {len(storage_mounts)} storage{plural}', log_path))
4859
4854
 
4860
4855
  for dst, storage_obj in storage_mounts.items():
4856
+ storage_obj.construct()
4861
4857
  if not os.path.isabs(dst) and not dst.startswith('~/'):
4862
4858
  dst = f'{SKY_REMOTE_WORKDIR}/{dst}'
4863
4859
  # Raised when the bucket is externall removed before re-mounting
@@ -4871,6 +4867,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4871
4867
  'successfully without mounting the bucket.')
4872
4868
  # Get the first store and use it to mount
4873
4869
  store = list(storage_obj.stores.values())[0]
4870
+ assert store is not None, storage_obj
4874
4871
  mount_cmd = store.mount_command(dst)
4875
4872
  src_print = (storage_obj.source
4876
4873
  if storage_obj.source else storage_obj.name)
@@ -4925,6 +4922,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4925
4922
  return
4926
4923
  storage_mounts_metadata = {}
4927
4924
  for dst, storage_obj in storage_mounts.items():
4925
+ if storage_obj.mode != storage_lib.StorageMode.MOUNT:
4926
+ # Skip non-mount storage objects, as there is no need to
4927
+ # reconstruct them during cluster restart.
4928
+ continue
4928
4929
  storage_mounts_metadata[dst] = storage_obj.handle
4929
4930
  lock_path = (
4930
4931
  backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))