skypilot-nightly 1.0.0.dev20250116__py3-none-any.whl → 1.0.0.dev20250118__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +50 -29
- sky/cli.py +11 -34
- sky/core.py +8 -5
- sky/data/storage.py +16 -7
- sky/global_user_state.py +1 -1
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +14 -16
- sky/jobs/core.py +0 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +257 -17
- sky/jobs/utils.py +287 -64
- sky/provision/kubernetes/instance.py +1 -1
- sky/resources.py +1 -1
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +2 -26
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/skylet.py +1 -1
- sky/templates/jobs-controller.yaml.j2 +7 -3
- sky/utils/resources_utils.py +25 -21
- sky/utils/subprocess_utils.py +48 -9
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250118.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250118.dist-info}/RECORD +30 -29
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250118.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250118.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250118.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250118.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '11861fd35820ff0db76ecce1dc9a644db4ffb8f7'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250116'
+__version__ = '1.0.0.dev20250118'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -10,6 +10,7 @@ import os
 import pathlib
 import re
 import shlex
+import shutil
 import signal
 import subprocess
 import sys
@@ -44,6 +45,7 @@ from sky.clouds import service_catalog
 from sky.clouds.utils import gcp_utils
 from sky.data import data_utils
 from sky.data import storage as storage_lib
+from sky.jobs import constants as managed_jobs_constants
 from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
@@ -154,6 +156,9 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
 # might be added during ssh.
 _MAX_INLINE_SCRIPT_LENGTH = 120 * 1024
 
+_RESOURCES_UNAVAILABLE_LOG = (
+'Reasons for provision failures (for details, please check the log above):')
+
 
 def _is_command_length_over_limit(command: str) -> bool:
 """Check if the length of the command exceeds the limit.
@@ -1996,6 +2001,7 @@ class RetryingVmProvisioner(object):
 skip_unnecessary_provisioning else None)
 
 failover_history: List[Exception] = list()
+resource_exceptions: Dict[resources_lib.Resources, Exception] = dict()
 # If the user is using local credentials which may expire, the
 # controller may leak resources if the credentials expire while a job
 # is running. Here we check the enabled clouds and expiring credentials
@@ -2087,6 +2093,8 @@ class RetryingVmProvisioner(object):
 # Add failed resources to the blocklist, only when it
 # is in fallback mode.
 _add_to_blocked_resources(self._blocked_resources, to_provision)
+assert len(failover_history) > 0
+resource_exceptions[to_provision] = failover_history[-1]
 else:
 # If we reach here, it means that the existing cluster must have
 # a previous status of INIT, because other statuses (UP,
@@ -2131,7 +2139,14 @@ class RetryingVmProvisioner(object):
 # possible resources or the requested resources is too
 # restrictive. If we reach here, our failover logic finally
 # ends here.
-
+table = log_utils.create_table(['Resource', 'Reason'])
+for (resource, exception) in resource_exceptions.items():
+table.add_row(
+[resources_utils.format_resource(resource), exception])
+table.max_table_width = shutil.get_terminal_size().columns
+raise exceptions.ResourcesUnavailableError(
+_RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
+failover_history=failover_history)
 to_provision = task.best_resources
 assert task in self._dag.tasks, 'Internal logic error.'
 assert to_provision is not None, task
@@ -2894,7 +2909,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 'the `--retry-until-up` flag.')
 with ux_utils.print_exception_no_traceback():
 raise exceptions.ResourcesUnavailableError(
-error_message,
+error_message + '\n' + str(e),
 failover_history=e.failover_history) from None
 if dryrun:
 record = global_user_state.get_cluster_from_name(cluster_name)
@@ -3909,40 +3924,45 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 Returns:
 A dictionary mapping job_id to log path.
 """
-# if job_name
+# if job_name and job_id should not both be specified
 assert job_name is None or job_id is None, (job_name, job_id)
-
+
+if job_id is None:
 # generate code to get the job_id
+# if job_name is None, get all job_ids
+# TODO: Only get the latest job_id, since that's the only one we use
 code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
 job_name=job_name)
-returncode,
-
-
-
-
-separate_stderr=True)
+returncode, job_ids, stderr = self.run_on_head(handle,
+code,
+stream_logs=False,
+require_outputs=True,
+separate_stderr=True)
 subprocess_utils.handle_returncode(returncode, code,
 'Failed to sync down logs.',
 stderr)
-job_ids = common_utils.decode_payload(
+job_ids = common_utils.decode_payload(job_ids)
 if not job_ids:
 logger.info(f'{colorama.Fore.YELLOW}'
 'No matching job found'
 f'{colorama.Style.RESET_ALL}')
 return {}
 elif len(job_ids) > 1:
-
-
-
-
-
-
-
-
+name_str = ''
+if job_name is not None:
+name_str = ('Multiple jobs IDs found under the name '
+f'{job_name}. ')
+logger.info(f'{colorama.Fore.YELLOW}'
+f'{name_str}'
+'Downloading the latest job logs.'
+f'{colorama.Style.RESET_ALL}')
+# list should aready be in descending order
+job_id = job_ids[0]
 
 # get the run_timestamp
 # the function takes in [job_id]
-code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+[str(job_id)])
 returncode, run_timestamps, stderr = self.run_on_head(
 handle,
 code,
@@ -3963,13 +3983,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 job_id = list(run_timestamps.keys())[0]
 local_log_dir = ''
 if controller:  # download controller logs
-
-
+remote_log = os.path.join(
+managed_jobs_constants.JOBS_CONTROLLER_LOGS_DIR,
+f'{job_id}.log')
 local_log_dir = os.path.expanduser(
 os.path.join(local_dir, run_timestamp))
 
 logger.info(f'{colorama.Fore.CYAN}'
-f'Job {
+f'Job {job_id} local logs: {local_log_dir}'
 f'{colorama.Style.RESET_ALL}')
 
 runners = handle.get_command_runners()
@@ -3980,12 +4001,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 Args:
 args: A tuple of (runner, local_log_dir, remote_log_dir)
 """
-(runner, local_log_dir,
+(runner, local_log_dir, remote_log) = args
 try:
 os.makedirs(local_log_dir, exist_ok=True)
 runner.rsync(
-source=
-target=local_log_dir,
+source=remote_log,
+target=f'{local_log_dir}/controller.log',
 up=False,
 stream_logs=False,
 )
@@ -3998,9 +4019,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 else:
 raise
 
-parallel_args = [
-
-
+parallel_args = [
+(runner, local_log_dir, remote_log) for runner in runners
+]
 subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
 else:  # download job logs
 local_log_dir = os.path.expanduser(
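Note: the new provisioning-failure summary above collects, for each candidate resource, the last exception from its failover history and renders the pairs as a two-column table before raising ResourcesUnavailableError. A minimal sketch of that pattern follows, using prettytable directly; it assumes sky.utils.log_utils.create_table returns a similar table object (the real helper is not shown in this diff), and the resource names and exceptions are hypothetical.

    # Sketch of the failure-summary pattern added above, using prettytable
    # directly. log_utils.create_table is assumed (not confirmed here) to
    # return a comparable table object.
    import shutil

    from prettytable import PrettyTable

    # Hypothetical stand-ins for the Resources objects and exceptions that
    # resource_exceptions accumulates during failover.
    resource_exceptions = {
        'AWS(p4d.24xlarge)': Exception('Insufficient capacity in us-east-1'),
        'GCP(a2-highgpu-8g)': Exception('Quota exceeded for NVIDIA_A100'),
    }

    table = PrettyTable(['Resource', 'Reason'])
    for resource, exception in resource_exceptions.items():
        table.add_row([resource, exception])
    # Keep the table within the current terminal width, as the diff does.
    table.max_table_width = shutil.get_terminal_size().columns

    print('Reasons for provision failures (for details, please check the log above):')
    print(table.get_string())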
sky/cli.py
CHANGED
@@ -3530,11 +3530,11 @@ def storage_delete(names: List[str], all: bool, yes: bool):  # pylint: disable=r
 if sum([bool(names), all]) != 1:
 raise click.UsageError('Either --all or a name must be specified.')
 if all:
-
-
+# Use '*' to get all storages.
+names = global_user_state.get_glob_storage_name(storage_name='*')
+if not names:
 click.echo('No storage(s) to delete.')
 return
-names = [s['name'] for s in storages]
 else:
 names = _get_glob_storages(names)
 if names:
@@ -3548,7 +3548,13 @@ def storage_delete(names: List[str], all: bool, yes: bool):  # pylint: disable=r
 abort=True,
 show_default=True)
 
-
+def delete_storage(name: str) -> None:
+try:
+sky.storage_delete(name)
+except Exception as e:  # pylint: disable=broad-except
+click.secho(f'Error deleting storage {name}: {e}', fg='red')
+
+subprocess_utils.run_in_parallel(delete_storage, names)
 
 
 @cli.group(cls=_NaturalOrderGroup)
@@ -3588,18 +3594,6 @@ def jobs():
 is_flag=True,
 help=('If True, as soon as a job is submitted, return from this call '
 'and do not stream execution logs.'))
-@click.option(
-'--retry-until-up/--no-retry-until-up',
-'-r/-no-r',
-default=None,
-is_flag=True,
-required=False,
-help=(
-'(Default: True; this flag is deprecated and will be removed in a '
-'future release.) Whether to retry provisioning infinitely until the '
-'cluster is up, if unavailability errors are encountered. This '  # pylint: disable=bad-docstring-quotes
-'applies to launching all managed jobs (both the initial and '
-'any recovery attempts), not the jobs controller.'))
 @click.option('--yes',
 '-y',
 is_flag=True,
@@ -3636,7 +3630,6 @@ def jobs_launch(
 disk_tier: Optional[str],
 ports: Tuple[str],
 detach_run: bool,
-retry_until_up: Optional[bool],
 yes: bool,
 fast: bool,
 ):
@@ -3680,19 +3673,6 @@ def jobs_launch(
 ports=ports,
 job_recovery=job_recovery,
 )
-# Deprecation. We set the default behavior to be retry until up, and the
-# flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0.
-if retry_until_up is not None:
-flag_str = '--retry-until-up'
-if not retry_until_up:
-flag_str = '--no-retry-until-up'
-click.secho(
-f'Flag {flag_str} is deprecated and will be removed in a '
-'future release (managed jobs will always be retried). '
-'Please file an issue if this does not work for you.',
-fg='yellow')
-else:
-retry_until_up = True
 
 # Deprecation. The default behavior is fast, and the flag will be removed.
 # The flag was not present in 0.7.x (only nightly), so we will remove before
@@ -3742,10 +3722,7 @@ def jobs_launch(
 
 common_utils.check_cluster_name_is_valid(name)
 
-managed_jobs.launch(dag,
-name,
-detach_run=detach_run,
-retry_until_up=retry_until_up)
+managed_jobs.launch(dag, name, detach_run=detach_run)
 
 
 @jobs.command('queue', cls=_DocumentedCodeCommand)
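Note: `sky storage delete --all` now resolves names with a glob and deletes them in parallel, wrapping each deletion in a per-name error handler so one failure does not abort the rest. Below is a sketch of the same fan-out pattern using only the standard library; it assumes subprocess_utils.run_in_parallel behaves like a map over a worker pool, and the storage names are hypothetical.

    # Per-name error isolation + parallel fan-out, written with the stdlib
    # instead of sky.utils.subprocess_utils.run_in_parallel (assumed to act
    # like pool.map over the given callable and argument list).
    from concurrent.futures import ThreadPoolExecutor


    def delete_storage(name: str) -> None:
        try:
            # The real CLI calls sky.storage_delete(name); a print stands in.
            print(f'Deleting storage {name}')
        except Exception as e:  # pylint: disable=broad-except
            # One failed deletion should not abort the others.
            print(f'Error deleting storage {name}: {e}')


    names = ['bucket-a', 'bucket-b', 'bucket-c']  # hypothetical storage names
    with ThreadPoolExecutor() as pool:
        # list() forces all deletions to finish before returning.
        list(pool.map(delete_storage, names))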
sky/core.py
CHANGED
@@ -915,8 +915,11 @@ def storage_delete(name: str) -> None:
 handle = global_user_state.get_handle_from_storage_name(name)
 if handle is None:
 raise ValueError(f'Storage name {name!r} not found.')
-
-
-
-
-
+
+assert handle.storage_name == name, (
+f'In global_user_state, storage name {name!r} does not match '
+f'handle.storage_name {handle.storage_name!r}')
+storage_object = data.Storage(name=handle.storage_name,
+source=handle.source,
+sync_on_reconstruction=False)
+storage_object.delete()
sky/data/storage.py
CHANGED
@@ -1083,18 +1083,16 @@ class Storage(object):
 if not self.stores:
 logger.info('No backing stores found. Deleting storage.')
 global_user_state.remove_storage(self.name)
-if store_type:
+if store_type is not None:
 store = self.stores[store_type]
-is_sky_managed = store.is_sky_managed
 # We delete a store from the cloud if it's sky managed. Else just
 # remove handle and return
-if is_sky_managed:
+if store.is_sky_managed:
 self.handle.remove_store(store)
 store.delete()
 # Check remaining stores - if none is sky managed, remove
 # the storage from global_user_state.
-delete = all(
-s.is_sky_managed is False for s in self.stores.values())
+delete = all(not s.is_sky_managed for s in self.stores.values())
 if delete:
 global_user_state.remove_storage(self.name)
 else:
@@ -1689,6 +1687,9 @@ class S3Store(AbstractStore):
 
 Returns:
 bool; True if bucket was deleted, False if it was deleted externally.
+
+Raises:
+StorageBucketDeleteError: If deleting the bucket fails.
 """
 # Deleting objects is very slow programatically
 # (i.e. bucket.objects.all().delete() is slow).
@@ -2179,6 +2180,11 @@ class GcsStore(AbstractStore):
 
 Returns:
 bool; True if bucket was deleted, False if it was deleted externally.
+
+Raises:
+StorageBucketDeleteError: If deleting the bucket fails.
+PermissionError: If the bucket is external and the user is not
+allowed to delete it.
 """
 if _bucket_sub_path is not None:
 command_suffix = f'/{_bucket_sub_path}'
@@ -3478,6 +3484,9 @@ class R2Store(AbstractStore):
 
 Returns:
 bool; True if bucket was deleted, False if it was deleted externally.
+
+Raises:
+StorageBucketDeleteError: If deleting the bucket fails.
 """
 # Deleting objects is very slow programatically
 # (i.e. bucket.objects.all().delete() is slow).
@@ -3932,7 +3941,7 @@ class IBMCosStore(AbstractStore):
 
 def _delete_cos_bucket_objects(self,
 bucket: Any,
-prefix: Optional[str] = None):
+prefix: Optional[str] = None) -> None:
 bucket_versioning = self.s3_resource.BucketVersioning(bucket.name)
 if bucket_versioning.status == 'Enabled':
 if prefix is not None:
@@ -3947,7 +3956,7 @@ class IBMCosStore(AbstractStore):
 res = list(bucket.objects.delete())
 logger.debug(f'Deleted bucket\'s content:\n{res}, prefix: {prefix}')
 
-def _delete_cos_bucket(self):
+def _delete_cos_bucket(self) -> None:
 bucket = self.s3_resource.Bucket(self.name)
 try:
 self._delete_cos_bucket_objects(bucket)
sky/global_user_state.py
CHANGED
@@ -827,7 +827,7 @@ def get_storage_names_start_with(starts_with: str) -> List[str]:
 
 
 def get_storage() -> List[Dict[str, Any]]:
-rows = _DB.cursor.execute('
+rows = _DB.cursor.execute('SELECT * FROM storage')
 records = []
 for name, launched_at, handle, last_use, status in rows:
 # TODO: use namedtuple instead of dict
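Note: get_storage() now issues a plain 'SELECT * FROM storage' and unpacks each row into a record dict. A self-contained sketch of that row-unpacking pattern follows; the schema is a reduced, hypothetical stand-in for SkyPilot's actual storage table.

    # Sketch of the row-unpacking pattern in get_storage(), with a
    # hypothetical in-memory table instead of SkyPilot's state database.
    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE storage '
                 '(name TEXT, launched_at INTEGER, handle BLOB, '
                 'last_use TEXT, status TEXT)')
    conn.execute('INSERT INTO storage VALUES (?, ?, ?, ?, ?)',
                 ('demo-bucket', 1737000000, b'', 'sky launch', 'READY'))

    rows = conn.execute('SELECT * FROM storage')
    records = []
    for name, launched_at, handle, last_use, status in rows:
        # Each tuple becomes one record dict, mirroring the loop in the diff.
        records.append({
            'name': name,
            'launched_at': launched_at,
            'handle': handle,
            'last_use': last_use,
            'status': status,
        })
    print(records)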
sky/jobs/constants.py
CHANGED
@@ -2,18 +2,19 @@
 
 JOBS_CONTROLLER_TEMPLATE = 'jobs-controller.yaml.j2'
 JOBS_CONTROLLER_YAML_PREFIX = '~/.sky/jobs_controller'
+JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
 
 JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
 
 # Resources as a dict for the jobs controller.
-# Use
-#
-#
-#
-#
-#
+# Use smaller CPU instance type for jobs controller, but with more memory, i.e.
+# r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
+# and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
+# Concurrently limits are set based on profiling. 4x num vCPUs is the launch
+# parallelism limit, and memory / 350MB is the limit to concurrently running
+# jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
 # We use 50 GB disk size to reduce the cost.
-CONTROLLER_RESOURCES = {'cpus': '
+CONTROLLER_RESOURCES = {'cpus': '4+', 'memory': '8x', 'disk_size': 50}
 
 # Max length of the cluster name for GCP is 35, the user hash to be attached is
 # 4+1 chars, and we assume the maximum length of the job id is 4+1, so the max
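Note: the new comment ties the controller's concurrency limits to its instance size: launch parallelism is 4 x vCPUs and job parallelism is memory / 350 MB, implemented by _get_launch_parallelism and _get_job_parallelism in scheduler.py (not shown in this diff). A back-of-the-envelope sketch of those formulas, using only the values stated in the comment:

    # Back-of-the-envelope check of the limits described in the comment above.
    # The formulas come from the comment; the function names mirror the ones it
    # cites in scheduler.py, but the real implementations are not shown here.

    JOB_MEMORY_MB = 350  # per-job memory budget stated in the comment


    def get_launch_parallelism(cpus: int) -> int:
        return 4 * cpus


    def get_job_parallelism(memory_gb: float) -> int:
        return int(memory_gb * 1024 // JOB_MEMORY_MB)


    # For the r6i.xlarge / Standard_E4s_v5 / n2-highmem-4 class mentioned
    # above (4 vCPUs, 32 GB):
    print(get_launch_parallelism(4))  # 16 concurrent launches
    print(get_job_parallelism(32))    # ~93 concurrently running jobs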
sky/jobs/controller.py
CHANGED
@@ -16,6 +16,7 @@ from sky import status_lib
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.jobs import recovery_strategy
+from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import constants
@@ -46,12 +47,10 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
 class JobsController:
 """Each jobs controller manages the life cycle of one managed job."""
 
-def __init__(self, job_id: int, dag_yaml: str
-retry_until_up: bool) -> None:
+def __init__(self, job_id: int, dag_yaml: str) -> None:
 self._job_id = job_id
 self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
 logger.info(self._dag)
-self._retry_until_up = retry_until_up
 # TODO(zhwu): this assumes the specific backend.
 self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
 
@@ -174,7 +173,7 @@ class JobsController:
 cluster_name = managed_job_utils.generate_managed_job_cluster_name(
 task.name, self._job_id)
 self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-cluster_name, self._backend, task, self.
+cluster_name, self._backend, task, self._job_id)
 managed_job_state.set_submitted(
 self._job_id,
 task_id,
@@ -202,6 +201,7 @@ class JobsController:
 task_id=task_id,
 start_time=remote_job_submitted_at,
 callback_func=callback_func)
+
 while True:
 time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)
 
@@ -243,7 +243,7 @@ class JobsController:
 self._download_log_and_stream(task_id, handle)
 # Only clean up the cluster, not the storages, because tasks may
 # share storages.
-
+managed_job_utils.terminate_cluster(cluster_name=cluster_name)
 return True
 
 # For single-node jobs, non-terminated job_status indicates a
@@ -342,7 +342,7 @@ class JobsController:
 # those clusters again may fail.
 logger.info('Cleaning up the preempted or failed cluster'
 '...')
-
+managed_job_utils.terminate_cluster(cluster_name)
 
 # Try to recover the managed jobs, when the cluster is preempted or
 # failed or the job status is failed to be fetched.
@@ -424,11 +424,11 @@ class JobsController:
 task=self._dag.tasks[task_id]))
 
 
-def _run_controller(job_id: int, dag_yaml: str
+def _run_controller(job_id: int, dag_yaml: str):
 """Runs the controller in a remote process for interruption."""
 # The controller needs to be instantiated in the remote process, since
 # the controller is not serializable.
-jobs_controller = JobsController(job_id, dag_yaml
+jobs_controller = JobsController(job_id, dag_yaml)
 jobs_controller.run()
 
 
@@ -478,14 +478,14 @@ def _cleanup(job_id: int, dag_yaml: str):
 assert task.name is not None, task
 cluster_name = managed_job_utils.generate_managed_job_cluster_name(
 task.name, job_id)
-
+managed_job_utils.terminate_cluster(cluster_name)
 # Clean up Storages with persistent=False.
 # TODO(zhwu): this assumes the specific backend.
 backend = cloud_vm_ray_backend.CloudVmRayBackend()
 backend.teardown_ephemeral_storage(task)
 
 
-def start(job_id, dag_yaml
+def start(job_id, dag_yaml):
 """Start the controller."""
 controller_process = None
 cancelling = False
@@ -499,8 +499,7 @@ def start(job_id, dag_yaml, retry_until_up):
 # So we can only enable daemon after we no longer need to
 # start daemon processes like Ray.
 controller_process = multiprocessing.Process(target=_run_controller,
-args=(job_id, dag_yaml
-retry_until_up))
+args=(job_id, dag_yaml))
 controller_process.start()
 while controller_process.is_alive():
 _handle_signal(job_id)
@@ -562,6 +561,8 @@ def start(job_id, dag_yaml, retry_until_up):
 failure_reason=('Unexpected error occurred. For details, '
 f'run: sky jobs logs --controller {job_id}'))
 
+scheduler.job_done(job_id)
+
 
 if __name__ == '__main__':
 parser = argparse.ArgumentParser()
@@ -569,9 +570,6 @@ if __name__ == '__main__':
 required=True,
 type=int,
 help='Job id for the controller job.')
-parser.add_argument('--retry-until-up',
-action='store_true',
-help='Retry until the cluster is up.')
 parser.add_argument('dag_yaml',
 type=str,
 help='The path to the user job yaml file.')
@@ -579,4 +577,4 @@ if __name__ == '__main__':
 # We start process with 'spawn', because 'fork' could result in weird
 # behaviors; 'spawn' is also cross-platform.
 multiprocessing.set_start_method('spawn', force=True)
-start(args.job_id, args.dag_yaml
+start(args.job_id, args.dag_yaml)
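Note: the controller entry point now starts _run_controller with only (job_id, dag_yaml) and keeps the 'spawn' start method. A minimal, self-contained sketch of that multiprocessing pattern follows; the worker body is a placeholder, not the real sky.jobs.controller logic.

    # Sketch of the controller-process pattern shown above: a worker is
    # started with 'spawn' and only (job_id, dag_yaml), and the parent polls
    # it while it is alive. _run_controller here is a placeholder.
    import multiprocessing
    import time


    def _run_controller(job_id: int, dag_yaml: str) -> None:
        print(f'controller for job {job_id} running {dag_yaml}')
        time.sleep(1)


    if __name__ == '__main__':
        # 'spawn' avoids fork-related issues and works across platforms,
        # matching the comment in the diff.
        multiprocessing.set_start_method('spawn', force=True)
        proc = multiprocessing.Process(target=_run_controller,
                                       args=(1, 'dag.yaml'))
        proc.start()
        while proc.is_alive():
            # The real controller checks for cancellation signals here.
            time.sleep(0.1)
        proc.join()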
sky/jobs/core.py
CHANGED
@@ -41,7 +41,6 @@ def launch(
 name: Optional[str] = None,
 stream_logs: bool = True,
 detach_run: bool = False,
-retry_until_up: bool = False,
 # TODO(cooperc): remove fast arg before 0.8.0
 fast: bool = True,  # pylint: disable=unused-argument for compatibility
 ) -> None:
@@ -115,7 +114,6 @@ def launch(
 'jobs_controller': controller_name,
 # Note: actual cluster name will be <task.name>-<managed job ID>
 'dag_name': dag.name,
-'retry_until_up': retry_until_up,
 'remote_user_config_path': remote_user_config_path,
 'modified_catalogs':
 service_catalog_common.get_modified_catalog_file_mounts(),
|