skypilot-nightly 1.0.0.dev20250115__py3-none-any.whl → 1.0.0.dev20250117__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +50 -67
- sky/cli.py +11 -34
- sky/core.py +8 -5
- sky/data/storage.py +66 -14
- sky/global_user_state.py +1 -1
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +0 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +257 -21
- sky/jobs/utils.py +338 -96
- sky/provision/kubernetes/instance.py +1 -1
- sky/resources.py +1 -1
- sky/serve/core.py +30 -5
- sky/serve/replica_managers.py +1 -3
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/skylet.py +1 -1
- sky/templates/jobs-controller.yaml.j2 +7 -3
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/resources_utils.py +25 -21
- sky/utils/subprocess_utils.py +48 -9
- {skypilot_nightly-1.0.0.dev20250115.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250115.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/RECORD +33 -32
- {skypilot_nightly-1.0.0.dev20250115.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250115.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250115.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250115.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '9e1b4ddc5fb1cb3fd6c00c106555b919e449e2c9'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250115'
+__version__ = '1.0.0.dev20250117'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -10,6 +10,7 @@ import os
 import pathlib
 import re
 import shlex
+import shutil
 import signal
 import subprocess
 import sys
@@ -35,7 +36,6 @@ from sky import jobs as managed_jobs
 from sky import optimizer
 from sky import provision as provision_lib
 from sky import resources as resources_lib
-from sky import serve as serve_lib
 from sky import sky_logging
 from sky import status_lib
 from sky import task as task_lib
@@ -45,6 +45,7 @@ from sky.clouds import service_catalog
 from sky.clouds.utils import gcp_utils
 from sky.data import data_utils
 from sky.data import storage as storage_lib
+from sky.jobs import constants as managed_jobs_constants
 from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
@@ -155,6 +156,9 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
 # might be added during ssh.
 _MAX_INLINE_SCRIPT_LENGTH = 120 * 1024
 
+_RESOURCES_UNAVAILABLE_LOG = (
+    'Reasons for provision failures (for details, please check the log above):')
+
 
 def _is_command_length_over_limit(command: str) -> bool:
     """Check if the length of the command exceeds the limit.
@@ -1997,6 +2001,7 @@ class RetryingVmProvisioner(object):
             skip_unnecessary_provisioning else None)
 
         failover_history: List[Exception] = list()
+        resource_exceptions: Dict[resources_lib.Resources, Exception] = dict()
         # If the user is using local credentials which may expire, the
         # controller may leak resources if the credentials expire while a job
         # is running. Here we check the enabled clouds and expiring credentials
@@ -2088,6 +2093,8 @@ class RetryingVmProvisioner(object):
                 # Add failed resources to the blocklist, only when it
                 # is in fallback mode.
                 _add_to_blocked_resources(self._blocked_resources, to_provision)
+                assert len(failover_history) > 0
+                resource_exceptions[to_provision] = failover_history[-1]
             else:
                 # If we reach here, it means that the existing cluster must have
                 # a previous status of INIT, because other statuses (UP,
@@ -2132,7 +2139,14 @@ class RetryingVmProvisioner(object):
                 # possible resources or the requested resources is too
                 # restrictive. If we reach here, our failover logic finally
                 # ends here.
-
+                table = log_utils.create_table(['Resource', 'Reason'])
+                for (resource, exception) in resource_exceptions.items():
+                    table.add_row(
+                        [resources_utils.format_resource(resource), exception])
+                table.max_table_width = shutil.get_terminal_size().columns
+                raise exceptions.ResourcesUnavailableError(
+                    _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
+                    failover_history=failover_history)
             to_provision = task.best_resources
             assert task in self._dag.tasks, 'Internal logic error.'
             assert to_provision is not None, task
@@ -2895,7 +2909,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'the `--retry-until-up` flag.')
                 with ux_utils.print_exception_no_traceback():
                     raise exceptions.ResourcesUnavailableError(
-                        error_message,
+                        error_message + '\n' + str(e),
                         failover_history=e.failover_history) from None
         if dryrun:
             record = global_user_state.get_cluster_from_name(cluster_name)
@@ -3910,40 +3924,45 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-        # if job_name
+        # if job_name and job_id should not both be specified
         assert job_name is None or job_id is None, (job_name, job_id)
-
+
+        if job_id is None:
             # generate code to get the job_id
+            # if job_name is None, get all job_ids
+            # TODO: Only get the latest job_id, since that's the only one we use
             code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
                 job_name=job_name)
-            returncode,
-
-
-
-
-                                                           separate_stderr=True)
+            returncode, job_ids, stderr = self.run_on_head(handle,
+                                                           code,
+                                                           stream_logs=False,
+                                                           require_outputs=True,
+                                                           separate_stderr=True)
             subprocess_utils.handle_returncode(returncode, code,
                                                'Failed to sync down logs.',
                                                stderr)
-            job_ids = common_utils.decode_payload(
+            job_ids = common_utils.decode_payload(job_ids)
             if not job_ids:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching job found'
                             f'{colorama.Style.RESET_ALL}')
                 return {}
             elif len(job_ids) > 1:
-
-
-
-
-
-
-
-
+                name_str = ''
+                if job_name is not None:
+                    name_str = ('Multiple jobs IDs found under the name '
+                                f'{job_name}. ')
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            f'{name_str}'
+                            'Downloading the latest job logs.'
+                            f'{colorama.Style.RESET_ALL}')
+                # list should aready be in descending order
+                job_id = job_ids[0]
 
         # get the run_timestamp
         # the function takes in [job_id]
-        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+            [str(job_id)])
         returncode, run_timestamps, stderr = self.run_on_head(
             handle,
             code,
@@ -3964,13 +3983,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_id = list(run_timestamps.keys())[0]
         local_log_dir = ''
         if controller: # download controller logs
-
-
+            remote_log = os.path.join(
+                managed_jobs_constants.JOBS_CONTROLLER_LOGS_DIR,
+                f'{job_id}.log')
             local_log_dir = os.path.expanduser(
                 os.path.join(local_dir, run_timestamp))
 
             logger.info(f'{colorama.Fore.CYAN}'
-                        f'Job {
+                        f'Job {job_id} local logs: {local_log_dir}'
                         f'{colorama.Style.RESET_ALL}')
 
             runners = handle.get_command_runners()
@@ -3981,12 +4001,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             Args:
                 args: A tuple of (runner, local_log_dir, remote_log_dir)
             """
-            (runner, local_log_dir,
+            (runner, local_log_dir, remote_log) = args
             try:
                 os.makedirs(local_log_dir, exist_ok=True)
                 runner.rsync(
-                    source=
-                    target=local_log_dir,
+                    source=remote_log,
+                    target=f'{local_log_dir}/controller.log',
                     up=False,
                     stream_logs=False,
                 )
@@ -3999,9 +4019,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 else:
                     raise
 
-            parallel_args = [
-
-
+            parallel_args = [
+                (runner, local_log_dir, remote_log) for runner in runners
+            ]
             subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
         else: # download job logs
             local_log_dir = os.path.expanduser(
@@ -4037,43 +4057,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         f'{colorama.Style.RESET_ALL}')
         return {str(job_id): local_log_dir}
 
-    def tail_serve_logs(self, handle: CloudVmRayResourceHandle,
-                        service_name: str, target: serve_lib.ServiceComponent,
-                        replica_id: Optional[int], follow: bool) -> None:
-        """Tail the logs of a service.
-
-        Args:
-            handle: The handle to the sky serve controller.
-            service_name: The name of the service.
-            target: The component to tail the logs of. Could be controller,
-                load balancer, or replica.
-            replica_id: The replica ID to tail the logs of. Only used when
-                target is replica.
-            follow: Whether to follow the logs.
-        """
-        if target != serve_lib.ServiceComponent.REPLICA:
-            code = serve_lib.ServeCodeGen.stream_serve_process_logs(
-                service_name,
-                stream_controller=(
-                    target == serve_lib.ServiceComponent.CONTROLLER),
-                follow=follow)
-        else:
-            assert replica_id is not None, service_name
-            code = serve_lib.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow)
-
-        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
-        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
-
-        self.run_on_head(
-            handle,
-            code,
-            stream_logs=True,
-            process_stream=False,
-            ssh_mode=command_runner.SshMode.INTERACTIVE,
-            stdin=subprocess.DEVNULL,
-        )
-
     def teardown_no_lock(self,
                          handle: CloudVmRayResourceHandle,
                          terminate: bool,
sky/cli.py
CHANGED
@@ -3530,11 +3530,11 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
     if sum([bool(names), all]) != 1:
         raise click.UsageError('Either --all or a name must be specified.')
     if all:
-
-
+        # Use '*' to get all storages.
+        names = global_user_state.get_glob_storage_name(storage_name='*')
+        if not names:
            click.echo('No storage(s) to delete.')
            return
-        names = [s['name'] for s in storages]
     else:
         names = _get_glob_storages(names)
     if names:
@@ -3548,7 +3548,13 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
                 abort=True,
                 show_default=True)
 
-
+    def delete_storage(name: str) -> None:
+        try:
+            sky.storage_delete(name)
+        except Exception as e:  # pylint: disable=broad-except
+            click.secho(f'Error deleting storage {name}: {e}', fg='red')
+
+    subprocess_utils.run_in_parallel(delete_storage, names)
 
 
 @cli.group(cls=_NaturalOrderGroup)
@@ -3588,18 +3594,6 @@ def jobs():
              is_flag=True,
              help=('If True, as soon as a job is submitted, return from this call '
                    'and do not stream execution logs.'))
-@click.option(
-    '--retry-until-up/--no-retry-until-up',
-    '-r/-no-r',
-    default=None,
-    is_flag=True,
-    required=False,
-    help=(
-        '(Default: True; this flag is deprecated and will be removed in a '
-        'future release.) Whether to retry provisioning infinitely until the '
-        'cluster is up, if unavailability errors are encountered. This ' # pylint: disable=bad-docstring-quotes
-        'applies to launching all managed jobs (both the initial and '
-        'any recovery attempts), not the jobs controller.'))
 @click.option('--yes',
              '-y',
              is_flag=True,
@@ -3636,7 +3630,6 @@ def jobs_launch(
     disk_tier: Optional[str],
     ports: Tuple[str],
     detach_run: bool,
-    retry_until_up: Optional[bool],
     yes: bool,
     fast: bool,
 ):
@@ -3680,19 +3673,6 @@ def jobs_launch(
         ports=ports,
         job_recovery=job_recovery,
     )
-    # Deprecation. We set the default behavior to be retry until up, and the
-    # flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0.
-    if retry_until_up is not None:
-        flag_str = '--retry-until-up'
-        if not retry_until_up:
-            flag_str = '--no-retry-until-up'
-        click.secho(
-            f'Flag {flag_str} is deprecated and will be removed in a '
-            'future release (managed jobs will always be retried). '
-            'Please file an issue if this does not work for you.',
-            fg='yellow')
-    else:
-        retry_until_up = True
 
     # Deprecation. The default behavior is fast, and the flag will be removed.
     # The flag was not present in 0.7.x (only nightly), so we will remove before
@@ -3742,10 +3722,7 @@ def jobs_launch(
 
     common_utils.check_cluster_name_is_valid(name)
 
-    managed_jobs.launch(dag,
-                        name,
-                        detach_run=detach_run,
-                        retry_until_up=retry_until_up)
+    managed_jobs.launch(dag, name, detach_run=detach_run)
 
 
 @jobs.command('queue', cls=_DocumentedCodeCommand)
sky/core.py
CHANGED
@@ -915,8 +915,11 @@ def storage_delete(name: str) -> None:
     handle = global_user_state.get_handle_from_storage_name(name)
     if handle is None:
         raise ValueError(f'Storage name {name!r} not found.')
-
-
-
-
-
+
+    assert handle.storage_name == name, (
+        f'In global_user_state, storage name {name!r} does not match '
+        f'handle.storage_name {handle.storage_name!r}')
+    storage_object = data.Storage(name=handle.storage_name,
+                                  source=handle.source,
+                                  sync_on_reconstruction=False)
+    storage_object.delete()
sky/data/storage.py
CHANGED
@@ -1083,18 +1083,16 @@ class Storage(object):
         if not self.stores:
             logger.info('No backing stores found. Deleting storage.')
             global_user_state.remove_storage(self.name)
-        if store_type:
+        if store_type is not None:
             store = self.stores[store_type]
-            is_sky_managed = store.is_sky_managed
             # We delete a store from the cloud if it's sky managed. Else just
             # remove handle and return
-            if is_sky_managed:
+            if store.is_sky_managed:
                 self.handle.remove_store(store)
                 store.delete()
                 # Check remaining stores - if none is sky managed, remove
                 # the storage from global_user_state.
-                delete = all(
-                    s.is_sky_managed is False for s in self.stores.values())
+                delete = all(not s.is_sky_managed for s in self.stores.values())
                 if delete:
                     global_user_state.remove_storage(self.name)
                 else:
@@ -1689,6 +1687,9 @@ class S3Store(AbstractStore):
 
         Returns:
             bool; True if bucket was deleted, False if it was deleted externally.
+
+        Raises:
+            StorageBucketDeleteError: If deleting the bucket fails.
         """
         # Deleting objects is very slow programatically
         # (i.e. bucket.objects.all().delete() is slow).
@@ -2179,6 +2180,11 @@ class GcsStore(AbstractStore):
 
         Returns:
             bool; True if bucket was deleted, False if it was deleted externally.
+
+        Raises:
+            StorageBucketDeleteError: If deleting the bucket fails.
+            PermissionError: If the bucket is external and the user is not
+                allowed to delete it.
         """
         if _bucket_sub_path is not None:
             command_suffix = f'/{_bucket_sub_path}'
@@ -3478,6 +3484,9 @@ class R2Store(AbstractStore):
 
         Returns:
             bool; True if bucket was deleted, False if it was deleted externally.
+
+        Raises:
+            StorageBucketDeleteError: If deleting the bucket fails.
         """
         # Deleting objects is very slow programatically
         # (i.e. bucket.objects.all().delete() is slow).
@@ -3932,7 +3941,7 @@ class IBMCosStore(AbstractStore):
 
     def _delete_cos_bucket_objects(self,
                                    bucket: Any,
-                                   prefix: Optional[str] = None):
+                                   prefix: Optional[str] = None) -> None:
         bucket_versioning = self.s3_resource.BucketVersioning(bucket.name)
         if bucket_versioning.status == 'Enabled':
             if prefix is not None:
@@ -3947,7 +3956,7 @@ class IBMCosStore(AbstractStore):
         res = list(bucket.objects.delete())
         logger.debug(f'Deleted bucket\'s content:\n{res}, prefix: {prefix}')
 
-    def _delete_cos_bucket(self):
+    def _delete_cos_bucket(self) -> None:
         bucket = self.s3_resource.Bucket(self.name)
         try:
             self._delete_cos_bucket_objects(bucket)
@@ -3968,7 +3977,7 @@ class OciStore(AbstractStore):
 
     def __init__(self,
                  name: str,
-                 source:
+                 source: Optional[SourceType],
                  region: Optional[str] = None,
                  is_sky_managed: Optional[bool] = None,
                  sync_on_reconstruction: Optional[bool] = True,
@@ -3980,13 +3989,53 @@ class OciStore(AbstractStore):
         self.compartment: str
         self.namespace: str
 
-        #
-
+        # Region is from the specified name in <bucket>@<region> format.
+        # Another case is name can also be set by the source, for example:
+        #   /datasets-storage:
+        #     source: oci://RAGData@us-sanjose-1
+        # The name in above mount will be set to RAGData@us-sanjose-1
+        region_in_name = None
+        if name is not None and '@' in name:
+            self._validate_bucket_expr(name)
+            name, region_in_name = name.split('@')
+
+        # Region is from the specified source in oci://<bucket>@<region> format
+        region_in_source = None
+        if isinstance(source,
+                      str) and source.startswith('oci://') and '@' in source:
+            self._validate_bucket_expr(source)
+            source, region_in_source = source.split('@')
+
+        if region_in_name is not None and region_in_source is not None:
+            # This should never happen because name and source will never be
+            # the remote bucket at the same time.
+            assert region_in_name == region_in_source, (
+                f'Mismatch region specified. Region in name {region_in_name}, '
+                f'but region in source is {region_in_source}')
+
+        if region_in_name is not None:
+            region = region_in_name
+        elif region_in_source is not None:
+            region = region_in_source
+
+        # Default region set to what specified in oci config.
+        if region is None:
+            region = oci.get_oci_config()['region']
+
+        # So far from now on, the name and source are canonical, means there
+        # is no region (@<region> suffix) associated with them anymore.
 
         super().__init__(name, source, region, is_sky_managed,
                          sync_on_reconstruction, _bucket_sub_path)
         # TODO(zpoint): add _bucket_sub_path to the sync/mount/delete commands
 
+    def _validate_bucket_expr(self, bucket_expr: str):
+        pattern = r'^(\w+://)?[A-Za-z0-9-._]+(@\w{2}-\w+-\d{1})$'
+        if not re.match(pattern, bucket_expr):
+            raise ValueError(
+                'The format for the bucket portion is <bucket>@<region> '
+                'when specify a region with a bucket.')
+
     def _validate(self):
         if self.source is not None and isinstance(self.source, str):
             if self.source.startswith('oci://'):
@@ -4137,7 +4186,8 @@ class OciStore(AbstractStore):
         sync_command = (
             'oci os object bulk-upload --no-follow-symlinks --overwrite '
             f'--bucket-name {self.name} --namespace-name {self.namespace} '
-            f'--src-dir "{base_dir_path}"
+            f'--region {self.region} --src-dir "{base_dir_path}" '
+            f'{includes}')
 
         return sync_command
 
@@ -4157,8 +4207,8 @@ class OciStore(AbstractStore):
         sync_command = (
             'oci os object bulk-upload --no-follow-symlinks --overwrite '
             f'--bucket-name {self.name} --namespace-name {self.namespace} '
-            f'--
-            f'{excludes}
+            f'--region {self.region} --object-prefix "{dest_dir_name}" '
+            f'--src-dir "{src_dir_path}" {excludes}')
 
         return sync_command
 
@@ -4289,7 +4339,8 @@ class OciStore(AbstractStore):
         def get_file_download_command(remote_path, local_path):
             download_command = (f'oci os object get --bucket-name {self.name} '
                                 f'--namespace-name {self.namespace} '
-                                f'--
+                                f'--region {self.region} --name {remote_path} '
+                                f'--file {local_path}')
 
             return download_command
 
@@ -4346,6 +4397,7 @@ class OciStore(AbstractStore):
         @oci.with_oci_env
         def get_bucket_delete_command(bucket_name):
             remove_command = (f'oci os bucket delete --bucket-name '
+                              f'--region {self.region} '
                               f'{bucket_name} --empty --force')
 
             return remove_command
sky/global_user_state.py
CHANGED
@@ -827,7 +827,7 @@ def get_storage_names_start_with(starts_with: str) -> List[str]:
 
 
 def get_storage() -> List[Dict[str, Any]]:
-    rows = _DB.cursor.execute('
+    rows = _DB.cursor.execute('SELECT * FROM storage')
     records = []
     for name, launched_at, handle, last_use, status in rows:
         # TODO: use namedtuple instead of dict
sky/jobs/constants.py
CHANGED
@@ -2,18 +2,19 @@
 
 JOBS_CONTROLLER_TEMPLATE = 'jobs-controller.yaml.j2'
 JOBS_CONTROLLER_YAML_PREFIX = '~/.sky/jobs_controller'
+JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
 
 JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
 
 # Resources as a dict for the jobs controller.
-# Use
-#
-#
-#
-#
-#
+# Use smaller CPU instance type for jobs controller, but with more memory, i.e.
+# r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
+# and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
+# Concurrently limits are set based on profiling. 4x num vCPUs is the launch
+# parallelism limit, and memory / 350MB is the limit to concurrently running
+# jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
 # We use 50 GB disk size to reduce the cost.
-CONTROLLER_RESOURCES = {'cpus': '
+CONTROLLER_RESOURCES = {'cpus': '4+', 'memory': '8x', 'disk_size': 50}
 
 # Max length of the cluster name for GCP is 35, the user hash to be attached is
 # 4+1 chars, and we assume the maximum length of the job id is 4+1, so the max
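The new comment documents how the jobs controller's concurrency is sized: launches are capped at 4x the vCPU count and the number of concurrently running jobs at total memory divided by roughly 350 MB per job, implemented by _get_launch_parallelism and _get_job_parallelism in the new sky/jobs/scheduler.py. A rough sketch of that arithmetic follows; the actual scheduler.py implementation may differ in detail.

# Approximate per-job memory footprint from the comment above (in MB).
_JOB_MEMORY_MB = 350


def get_launch_parallelism(num_cpus: int) -> int:
    # Launch parallelism limit: 4 concurrent launches per vCPU.
    return 4 * num_cpus


def get_job_parallelism(memory_mb: int) -> int:
    # Concurrent running-jobs limit: total memory divided by ~350 MB per job.
    return max(1, memory_mb // _JOB_MEMORY_MB)


# Example: a 4-vCPU, 32 GB controller (r6i.xlarge / n2-highmem-4 class).
print(get_launch_parallelism(4))       # 16
print(get_job_parallelism(32 * 1024))  # 93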
|