skypilot-nightly 1.0.0.dev20250116__py3-none-any.whl → 1.0.0.dev20250118__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request

  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = 'e71e5a92ccd90a654662121d6f08c4e100377bbf'
+ _SKYPILOT_COMMIT_SHA = '11861fd35820ff0db76ecce1dc9a644db4ffb8f7'


  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20250116'
+ __version__ = '1.0.0.dev20250118'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))

sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -10,6 +10,7 @@ import os
  import pathlib
  import re
  import shlex
+ import shutil
  import signal
  import subprocess
  import sys
@@ -44,6 +45,7 @@ from sky.clouds import service_catalog
  from sky.clouds.utils import gcp_utils
  from sky.data import data_utils
  from sky.data import storage as storage_lib
+ from sky.jobs import constants as managed_jobs_constants
  from sky.provision import common as provision_common
  from sky.provision import instance_setup
  from sky.provision import metadata_utils
@@ -154,6 +156,9 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
  # might be added during ssh.
  _MAX_INLINE_SCRIPT_LENGTH = 120 * 1024

+ _RESOURCES_UNAVAILABLE_LOG = (
+ 'Reasons for provision failures (for details, please check the log above):')
+

  def _is_command_length_over_limit(command: str) -> bool:
  """Check if the length of the command exceeds the limit.
@@ -1996,6 +2001,7 @@ class RetryingVmProvisioner(object):
  skip_unnecessary_provisioning else None)

  failover_history: List[Exception] = list()
+ resource_exceptions: Dict[resources_lib.Resources, Exception] = dict()
  # If the user is using local credentials which may expire, the
  # controller may leak resources if the credentials expire while a job
  # is running. Here we check the enabled clouds and expiring credentials
@@ -2087,6 +2093,8 @@ class RetryingVmProvisioner(object):
  # Add failed resources to the blocklist, only when it
  # is in fallback mode.
  _add_to_blocked_resources(self._blocked_resources, to_provision)
+ assert len(failover_history) > 0
+ resource_exceptions[to_provision] = failover_history[-1]
  else:
  # If we reach here, it means that the existing cluster must have
  # a previous status of INIT, because other statuses (UP,
@@ -2131,7 +2139,14 @@ class RetryingVmProvisioner(object):
  # possible resources or the requested resources is too
  # restrictive. If we reach here, our failover logic finally
  # ends here.
- raise e.with_failover_history(failover_history)
+ table = log_utils.create_table(['Resource', 'Reason'])
+ for (resource, exception) in resource_exceptions.items():
+ table.add_row(
+ [resources_utils.format_resource(resource), exception])
+ table.max_table_width = shutil.get_terminal_size().columns
+ raise exceptions.ResourcesUnavailableError(
+ _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
+ failover_history=failover_history)
  to_provision = task.best_resources
  assert task in self._dag.tasks, 'Internal logic error.'
  assert to_provision is not None, task
@@ -2894,7 +2909,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  'the `--retry-until-up` flag.')
  with ux_utils.print_exception_no_traceback():
  raise exceptions.ResourcesUnavailableError(
- error_message,
+ error_message + '\n' + str(e),
  failover_history=e.failover_history) from None
  if dryrun:
  record = global_user_state.get_cluster_from_name(cluster_name)
@@ -3909,40 +3924,45 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  Returns:
  A dictionary mapping job_id to log path.
  """
- # if job_name is not None, job_id should be None
+ # job_name and job_id should not both be specified
  assert job_name is None or job_id is None, (job_name, job_id)
- if job_id is None and job_name is not None:
+
+ if job_id is None:
  # generate code to get the job_id
+ # if job_name is None, get all job_ids
+ # TODO: Only get the latest job_id, since that's the only one we use
  code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
  job_name=job_name)
- returncode, run_timestamps, stderr = self.run_on_head(
- handle,
- code,
- stream_logs=False,
- require_outputs=True,
- separate_stderr=True)
+ returncode, job_ids, stderr = self.run_on_head(handle,
+ code,
+ stream_logs=False,
+ require_outputs=True,
+ separate_stderr=True)
  subprocess_utils.handle_returncode(returncode, code,
  'Failed to sync down logs.',
  stderr)
- job_ids = common_utils.decode_payload(run_timestamps)
+ job_ids = common_utils.decode_payload(job_ids)
  if not job_ids:
  logger.info(f'{colorama.Fore.YELLOW}'
  'No matching job found'
  f'{colorama.Style.RESET_ALL}')
  return {}
  elif len(job_ids) > 1:
- logger.info(
- f'{colorama.Fore.YELLOW}'
- f'Multiple jobs IDs found under the name {job_name}. '
- 'Downloading the latest job logs.'
- f'{colorama.Style.RESET_ALL}')
- job_ids = [job_ids[0]] # descending order
- else:
- job_ids = [job_id]
+ name_str = ''
+ if job_name is not None:
+ name_str = ('Multiple job IDs found under the name '
+ f'{job_name}. ')
+ logger.info(f'{colorama.Fore.YELLOW}'
+ f'{name_str}'
+ 'Downloading the latest job logs.'
+ f'{colorama.Style.RESET_ALL}')
+ # list should already be in descending order
+ job_id = job_ids[0]

  # get the run_timestamp
  # the function takes in [job_id]
- code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(job_ids)
+ code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+ [str(job_id)])
  returncode, run_timestamps, stderr = self.run_on_head(
  handle,
  code,
@@ -3963,13 +3983,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  job_id = list(run_timestamps.keys())[0]
  local_log_dir = ''
  if controller: # download controller logs
- remote_log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
- run_timestamp)
+ remote_log = os.path.join(
+ managed_jobs_constants.JOBS_CONTROLLER_LOGS_DIR,
+ f'{job_id}.log')
  local_log_dir = os.path.expanduser(
  os.path.join(local_dir, run_timestamp))

  logger.info(f'{colorama.Fore.CYAN}'
- f'Job {job_ids} local logs: {local_log_dir}'
+ f'Job {job_id} local logs: {local_log_dir}'
  f'{colorama.Style.RESET_ALL}')

  runners = handle.get_command_runners()
@@ -3980,12 +4001,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  Args:
  args: A tuple of (runner, local_log_dir, remote_log_dir)
  """
- (runner, local_log_dir, remote_log_dir) = args
+ (runner, local_log_dir, remote_log) = args
  try:
  os.makedirs(local_log_dir, exist_ok=True)
  runner.rsync(
- source=f'{remote_log_dir}/',
- target=local_log_dir,
+ source=remote_log,
+ target=f'{local_log_dir}/controller.log',
  up=False,
  stream_logs=False,
  )
@@ -3998,9 +4019,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  else:
  raise

- parallel_args = [[runner, *item]
- for item in zip([local_log_dir], [remote_log_dir])
- for runner in runners]
+ parallel_args = [
+ (runner, local_log_dir, remote_log) for runner in runners
+ ]
  subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
  else: # download job logs
  local_log_dir = os.path.expanduser(
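The failover path above now collects the last exception seen for each attempted resource and renders them as a table before raising ResourcesUnavailableError. Below is a minimal sketch of that summary-table pattern, using prettytable directly in place of SkyPilot's internal log_utils.create_table helper; the resource names and reasons are illustrative placeholders, not output from the real code.

import shutil

import prettytable

# Illustrative failure reasons, keyed by a resource description.
resource_exceptions = {
    'AWS(m6i.2xlarge, us-east-1)': 'ResourcesUnavailableError: no capacity',
    'GCP(n2-standard-8, us-central1)': 'QuotaExceeded: CPUS_ALL_REGIONS',
}

table = prettytable.PrettyTable(['Resource', 'Reason'])
for resource, reason in resource_exceptions.items():
    table.add_row([resource, reason])
# Cap the rendered width at the terminal width so long reasons wrap.
table.max_table_width = shutil.get_terminal_size().columns

print('Reasons for provision failures (for details, please check the log above):')
print(table.get_string())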
sky/cli.py CHANGED
@@ -3530,11 +3530,11 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
  if sum([bool(names), all]) != 1:
  raise click.UsageError('Either --all or a name must be specified.')
  if all:
- storages = sky.storage_ls()
- if not storages:
+ # Use '*' to get all storages.
+ names = global_user_state.get_glob_storage_name(storage_name='*')
+ if not names:
  click.echo('No storage(s) to delete.')
  return
- names = [s['name'] for s in storages]
  else:
  names = _get_glob_storages(names)
  if names:
@@ -3548,7 +3548,13 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
  abort=True,
  show_default=True)

- subprocess_utils.run_in_parallel(sky.storage_delete, names)
+ def delete_storage(name: str) -> None:
+ try:
+ sky.storage_delete(name)
+ except Exception as e: # pylint: disable=broad-except
+ click.secho(f'Error deleting storage {name}: {e}', fg='red')
+
+ subprocess_utils.run_in_parallel(delete_storage, names)


  @cli.group(cls=_NaturalOrderGroup)
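sky storage delete now wraps each deletion so a single failure is reported instead of aborting the whole batch. A rough sketch of the same wrap-and-continue pattern using only the standard library; ThreadPoolExecutor stands in for subprocess_utils.run_in_parallel, and fake_delete is a placeholder for sky.storage_delete.

from concurrent.futures import ThreadPoolExecutor


def fake_delete(name: str) -> None:
    # Placeholder for sky.storage_delete(name); fails for one bucket.
    if name == 'broken-bucket':
        raise RuntimeError('bucket is in use')


def delete_storage(name: str) -> None:
    """Delete one storage object; report failures instead of raising."""
    try:
        fake_delete(name)
    except Exception as e:  # pylint: disable=broad-except
        print(f'Error deleting storage {name}: {e}')


names = ['my-bucket', 'broken-bucket', 'other-bucket']
with ThreadPoolExecutor() as pool:
    # Each name is handled independently; one failure does not stop the rest.
    list(pool.map(delete_storage, names))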
@@ -3588,18 +3594,6 @@ def jobs():
  is_flag=True,
  help=('If True, as soon as a job is submitted, return from this call '
  'and do not stream execution logs.'))
- @click.option(
- '--retry-until-up/--no-retry-until-up',
- '-r/-no-r',
- default=None,
- is_flag=True,
- required=False,
- help=(
- '(Default: True; this flag is deprecated and will be removed in a '
- 'future release.) Whether to retry provisioning infinitely until the '
- 'cluster is up, if unavailability errors are encountered. This ' # pylint: disable=bad-docstring-quotes
- 'applies to launching all managed jobs (both the initial and '
- 'any recovery attempts), not the jobs controller.'))
  @click.option('--yes',
  '-y',
  is_flag=True,
@@ -3636,7 +3630,6 @@ def jobs_launch(
  disk_tier: Optional[str],
  ports: Tuple[str],
  detach_run: bool,
- retry_until_up: Optional[bool],
  yes: bool,
  fast: bool,
  ):
@@ -3680,19 +3673,6 @@ def jobs_launch(
  ports=ports,
  job_recovery=job_recovery,
  )
- # Deprecation. We set the default behavior to be retry until up, and the
- # flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0.
- if retry_until_up is not None:
- flag_str = '--retry-until-up'
- if not retry_until_up:
- flag_str = '--no-retry-until-up'
- click.secho(
- f'Flag {flag_str} is deprecated and will be removed in a '
- 'future release (managed jobs will always be retried). '
- 'Please file an issue if this does not work for you.',
- fg='yellow')
- else:
- retry_until_up = True

  # Deprecation. The default behavior is fast, and the flag will be removed.
  # The flag was not present in 0.7.x (only nightly), so we will remove before
@@ -3742,10 +3722,7 @@ def jobs_launch(

  common_utils.check_cluster_name_is_valid(name)

- managed_jobs.launch(dag,
- name,
- detach_run=detach_run,
- retry_until_up=retry_until_up)
+ managed_jobs.launch(dag, name, detach_run=detach_run)


  @jobs.command('queue', cls=_DocumentedCodeCommand)
sky/core.py CHANGED
@@ -915,8 +915,11 @@ def storage_delete(name: str) -> None:
  handle = global_user_state.get_handle_from_storage_name(name)
  if handle is None:
  raise ValueError(f'Storage name {name!r} not found.')
- else:
- storage_object = data.Storage(name=handle.storage_name,
- source=handle.source,
- sync_on_reconstruction=False)
- storage_object.delete()
+
+ assert handle.storage_name == name, (
+ f'In global_user_state, storage name {name!r} does not match '
+ f'handle.storage_name {handle.storage_name!r}')
+ storage_object = data.Storage(name=handle.storage_name,
+ source=handle.source,
+ sync_on_reconstruction=False)
+ storage_object.delete()
sky/data/storage.py CHANGED
@@ -1083,18 +1083,16 @@ class Storage(object):
  if not self.stores:
  logger.info('No backing stores found. Deleting storage.')
  global_user_state.remove_storage(self.name)
- if store_type:
+ if store_type is not None:
  store = self.stores[store_type]
- is_sky_managed = store.is_sky_managed
  # We delete a store from the cloud if it's sky managed. Else just
  # remove handle and return
- if is_sky_managed:
+ if store.is_sky_managed:
  self.handle.remove_store(store)
  store.delete()
  # Check remaining stores - if none is sky managed, remove
  # the storage from global_user_state.
- delete = all(
- s.is_sky_managed is False for s in self.stores.values())
+ delete = all(not s.is_sky_managed for s in self.stores.values())
  if delete:
  global_user_state.remove_storage(self.name)
  else:
@@ -1689,6 +1687,9 @@ class S3Store(AbstractStore):

  Returns:
  bool; True if bucket was deleted, False if it was deleted externally.
+
+ Raises:
+ StorageBucketDeleteError: If deleting the bucket fails.
  """
  # Deleting objects is very slow programmatically
  # (i.e. bucket.objects.all().delete() is slow).
@@ -2179,6 +2180,11 @@ class GcsStore(AbstractStore):

  Returns:
  bool; True if bucket was deleted, False if it was deleted externally.
+
+ Raises:
+ StorageBucketDeleteError: If deleting the bucket fails.
+ PermissionError: If the bucket is external and the user is not
+ allowed to delete it.
  """
  if _bucket_sub_path is not None:
  command_suffix = f'/{_bucket_sub_path}'
@@ -3478,6 +3484,9 @@ class R2Store(AbstractStore):

  Returns:
  bool; True if bucket was deleted, False if it was deleted externally.
+
+ Raises:
+ StorageBucketDeleteError: If deleting the bucket fails.
  """
  # Deleting objects is very slow programmatically
  # (i.e. bucket.objects.all().delete() is slow).
@@ -3932,7 +3941,7 @@ class IBMCosStore(AbstractStore):

  def _delete_cos_bucket_objects(self,
  bucket: Any,
- prefix: Optional[str] = None):
+ prefix: Optional[str] = None) -> None:
  bucket_versioning = self.s3_resource.BucketVersioning(bucket.name)
  if bucket_versioning.status == 'Enabled':
  if prefix is not None:
@@ -3947,7 +3956,7 @@ class IBMCosStore(AbstractStore):
  res = list(bucket.objects.delete())
  logger.debug(f'Deleted bucket\'s content:\n{res}, prefix: {prefix}')

- def _delete_cos_bucket(self):
+ def _delete_cos_bucket(self) -> None:
  bucket = self.s3_resource.Bucket(self.name)
  try:
  self._delete_cos_bucket_objects(bucket)
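The Storage.delete change above keeps the storage record in global_user_state as long as at least one remaining store is sky-managed. A condensed sketch of that decision follows, with a made-up Store dataclass standing in for the real store handles; the actual method also updates the storage handle and calls store.delete() on sky-managed buckets.

from dataclasses import dataclass
from typing import Dict


@dataclass
class Store:
    """Minimal stand-in for an AbstractStore handle."""
    is_sky_managed: bool


def delete_one_store(stores: Dict[str, Store], store_type: str) -> bool:
    """Remove one store; return True if the storage record should go too."""
    store = stores.pop(store_type)
    if store.is_sky_managed:
        pass  # The real code would delete the bucket here (store.delete()).
    # Drop the record only when no remaining store is sky-managed.
    return all(not s.is_sky_managed for s in stores.values())


stores = {'s3': Store(is_sky_managed=True), 'gcs': Store(is_sky_managed=False)}
print(delete_one_store(stores, 's3'))  # True: only an external store remains.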
sky/global_user_state.py CHANGED
@@ -827,7 +827,7 @@ def get_storage_names_start_with(starts_with: str) -> List[str]:


  def get_storage() -> List[Dict[str, Any]]:
- rows = _DB.cursor.execute('select * from storage')
+ rows = _DB.cursor.execute('SELECT * FROM storage')
  records = []
  for name, launched_at, handle, last_use, status in rows:
  # TODO: use namedtuple instead of dict
sky/jobs/constants.py CHANGED
@@ -2,18 +2,19 @@

  JOBS_CONTROLLER_TEMPLATE = 'jobs-controller.yaml.j2'
  JOBS_CONTROLLER_YAML_PREFIX = '~/.sky/jobs_controller'
+ JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'

  JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'

  # Resources as a dict for the jobs controller.
- # Use default CPU instance type for jobs controller with >= 24GB, i.e.
- # m6i.2xlarge (8vCPUs, 32 GB) for AWS, Standard_D8s_v4 (8vCPUs, 32 GB)
- # for Azure, and n1-standard-8 (8 vCPUs, 32 GB) for GCP, etc.
- # Based on profiling, memory should be at least 3x (in GB) as num vCPUs to avoid
- # OOM (each vCPU can have 4 jobs controller processes as we set the CPU
- # requirement to 0.25, and 3 GB is barely enough for 4 job processes).
+ # Use smaller CPU instance type for jobs controller, but with more memory, i.e.
+ # r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
+ # and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
+ # Concurrency limits are set based on profiling. 4x num vCPUs is the launch
+ # parallelism limit, and memory / 350MB is the limit to concurrently running
+ # jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
  # We use 50 GB disk size to reduce the cost.
- CONTROLLER_RESOURCES = {'cpus': '8+', 'memory': '3x', 'disk_size': 50}
+ CONTROLLER_RESOURCES = {'cpus': '4+', 'memory': '8x', 'disk_size': 50}


  # Max length of the cluster name for GCP is 35, the user hash to be attached is
sky/jobs/controller.py CHANGED
@@ -16,6 +16,7 @@ from sky import status_lib
  from sky.backends import backend_utils
  from sky.backends import cloud_vm_ray_backend
  from sky.jobs import recovery_strategy
+ from sky.jobs import scheduler
  from sky.jobs import state as managed_job_state
  from sky.jobs import utils as managed_job_utils
  from sky.skylet import constants
@@ -46,12 +47,10 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
  class JobsController:
  """Each jobs controller manages the life cycle of one managed job."""

- def __init__(self, job_id: int, dag_yaml: str,
- retry_until_up: bool) -> None:
+ def __init__(self, job_id: int, dag_yaml: str) -> None:
  self._job_id = job_id
  self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
  logger.info(self._dag)
- self._retry_until_up = retry_until_up
  # TODO(zhwu): this assumes the specific backend.
  self._backend = cloud_vm_ray_backend.CloudVmRayBackend()

@@ -174,7 +173,7 @@ class JobsController:
  cluster_name = managed_job_utils.generate_managed_job_cluster_name(
  task.name, self._job_id)
  self._strategy_executor = recovery_strategy.StrategyExecutor.make(
- cluster_name, self._backend, task, self._retry_until_up)
+ cluster_name, self._backend, task, self._job_id)
  managed_job_state.set_submitted(
  self._job_id,
  task_id,
@@ -202,6 +201,7 @@ class JobsController:
  task_id=task_id,
  start_time=remote_job_submitted_at,
  callback_func=callback_func)
+
  while True:
  time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)

@@ -243,7 +243,7 @@ class JobsController:
  self._download_log_and_stream(task_id, handle)
  # Only clean up the cluster, not the storages, because tasks may
  # share storages.
- recovery_strategy.terminate_cluster(cluster_name=cluster_name)
+ managed_job_utils.terminate_cluster(cluster_name=cluster_name)
  return True

  # For single-node jobs, non-terminated job_status indicates a
@@ -342,7 +342,7 @@ class JobsController:
  # those clusters again may fail.
  logger.info('Cleaning up the preempted or failed cluster'
  '...')
- recovery_strategy.terminate_cluster(cluster_name)
+ managed_job_utils.terminate_cluster(cluster_name)

  # Try to recover the managed jobs, when the cluster is preempted or
  # failed or the job status is failed to be fetched.
@@ -424,11 +424,11 @@ class JobsController:
  task=self._dag.tasks[task_id]))


- def _run_controller(job_id: int, dag_yaml: str, retry_until_up: bool):
+ def _run_controller(job_id: int, dag_yaml: str):
  """Runs the controller in a remote process for interruption."""
  # The controller needs to be instantiated in the remote process, since
  # the controller is not serializable.
- jobs_controller = JobsController(job_id, dag_yaml, retry_until_up)
+ jobs_controller = JobsController(job_id, dag_yaml)
  jobs_controller.run()


@@ -478,14 +478,14 @@ def _cleanup(job_id: int, dag_yaml: str):
  assert task.name is not None, task
  cluster_name = managed_job_utils.generate_managed_job_cluster_name(
  task.name, job_id)
- recovery_strategy.terminate_cluster(cluster_name)
+ managed_job_utils.terminate_cluster(cluster_name)
  # Clean up Storages with persistent=False.
  # TODO(zhwu): this assumes the specific backend.
  backend = cloud_vm_ray_backend.CloudVmRayBackend()
  backend.teardown_ephemeral_storage(task)


- def start(job_id, dag_yaml, retry_until_up):
+ def start(job_id, dag_yaml):
  """Start the controller."""
  controller_process = None
  cancelling = False
@@ -499,8 +499,7 @@ def start(job_id, dag_yaml, retry_until_up):
  # So we can only enable daemon after we no longer need to
  # start daemon processes like Ray.
  controller_process = multiprocessing.Process(target=_run_controller,
- args=(job_id, dag_yaml,
- retry_until_up))
+ args=(job_id, dag_yaml))
  controller_process.start()
  while controller_process.is_alive():
  _handle_signal(job_id)
@@ -562,6 +561,8 @@ def start(job_id, dag_yaml, retry_until_up):
  failure_reason=('Unexpected error occurred. For details, '
  f'run: sky jobs logs --controller {job_id}'))

+ scheduler.job_done(job_id)
+


  if __name__ == '__main__':
@@ -569,9 +570,6 @@ if __name__ == '__main__':
  required=True,
  type=int,
  help='Job id for the controller job.')
- parser.add_argument('--retry-until-up',
- action='store_true',
- help='Retry until the cluster is up.')
  parser.add_argument('dag_yaml',
  type=str,
  help='The path to the user job yaml file.')
@@ -579,4 +577,4 @@ if __name__ == '__main__':
  # We start process with 'spawn', because 'fork' could result in weird
  # behaviors; 'spawn' is also cross-platform.
  multiprocessing.set_start_method('spawn', force=True)
- start(args.job_id, args.dag_yaml, args.retry_until_up)
+ start(args.job_id, args.dag_yaml)
sky/jobs/core.py CHANGED
@@ -41,7 +41,6 @@ def launch(
  name: Optional[str] = None,
  stream_logs: bool = True,
  detach_run: bool = False,
- retry_until_up: bool = False,
  # TODO(cooperc): remove fast arg before 0.8.0
  fast: bool = True, # pylint: disable=unused-argument for compatibility
  ) -> None:
@@ -115,7 +114,6 @@ def launch(
  'jobs_controller': controller_name,
  # Note: actual cluster name will be <task.name>-<managed job ID>
  'dag_name': dag.name,
- 'retry_until_up': retry_until_up,
  'remote_user_config_path': remote_user_config_path,
  'modified_catalogs':
  service_catalog_common.get_modified_catalog_file_mounts(),