skypilot-nightly 1.0.0.dev20250114__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +50 -67
- sky/check.py +31 -1
- sky/cli.py +11 -34
- sky/clouds/kubernetes.py +3 -3
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/core.py +8 -5
- sky/data/storage.py +66 -14
- sky/global_user_state.py +1 -1
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +0 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +263 -21
- sky/jobs/utils.py +338 -96
- sky/provision/aws/config.py +48 -26
- sky/provision/gcp/instance_utils.py +15 -9
- sky/provision/kubernetes/instance.py +1 -1
- sky/provision/kubernetes/utils.py +76 -18
- sky/resources.py +1 -1
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +84 -7
- sky/serve/load_balancer.py +27 -10
- sky/serve/replica_managers.py +1 -3
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/skylet.py +1 -1
- sky/templates/jobs-controller.yaml.j2 +7 -3
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/resources_utils.py +25 -21
- sky/utils/schemas.py +13 -0
- sky/utils/subprocess_utils.py +48 -9
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +4 -1
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +49 -48
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
```diff
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '1c94d0f001ed6519873a59a7b46681d64dd696d2'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250114'
+__version__ = '1.0.0.dev20250124'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
```
sky/backends/cloud_vm_ray_backend.py
CHANGED
```diff
@@ -10,6 +10,7 @@ import os
 import pathlib
 import re
 import shlex
+import shutil
 import signal
 import subprocess
 import sys
@@ -35,7 +36,6 @@ from sky import jobs as managed_jobs
 from sky import optimizer
 from sky import provision as provision_lib
 from sky import resources as resources_lib
-from sky import serve as serve_lib
 from sky import sky_logging
 from sky import status_lib
 from sky import task as task_lib
@@ -45,6 +45,7 @@ from sky.clouds import service_catalog
 from sky.clouds.utils import gcp_utils
 from sky.data import data_utils
 from sky.data import storage as storage_lib
+from sky.jobs import constants as managed_jobs_constants
 from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
@@ -155,6 +156,9 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
 # might be added during ssh.
 _MAX_INLINE_SCRIPT_LENGTH = 120 * 1024
 
+_RESOURCES_UNAVAILABLE_LOG = (
+    'Reasons for provision failures (for details, please check the log above):')
+
 
 def _is_command_length_over_limit(command: str) -> bool:
     """Check if the length of the command exceeds the limit.
@@ -1997,6 +2001,7 @@ class RetryingVmProvisioner(object):
                               skip_unnecessary_provisioning else None)
 
         failover_history: List[Exception] = list()
+        resource_exceptions: Dict[resources_lib.Resources, Exception] = dict()
         # If the user is using local credentials which may expire, the
         # controller may leak resources if the credentials expire while a job
         # is running. Here we check the enabled clouds and expiring credentials
@@ -2088,6 +2093,8 @@ class RetryingVmProvisioner(object):
                 # Add failed resources to the blocklist, only when it
                 # is in fallback mode.
                 _add_to_blocked_resources(self._blocked_resources, to_provision)
+                assert len(failover_history) > 0
+                resource_exceptions[to_provision] = failover_history[-1]
             else:
                 # If we reach here, it means that the existing cluster must have
                 # a previous status of INIT, because other statuses (UP,
@@ -2132,7 +2139,14 @@ class RetryingVmProvisioner(object):
                 # possible resources or the requested resources is too
                 # restrictive. If we reach here, our failover logic finally
                 # ends here.
-
+                table = log_utils.create_table(['Resource', 'Reason'])
+                for (resource, exception) in resource_exceptions.items():
+                    table.add_row(
+                        [resources_utils.format_resource(resource), exception])
+                table.max_table_width = shutil.get_terminal_size().columns
+                raise exceptions.ResourcesUnavailableError(
+                    _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
+                    failover_history=failover_history)
             to_provision = task.best_resources
             assert task in self._dag.tasks, 'Internal logic error.'
             assert to_provision is not None, task
@@ -2895,7 +2909,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'the `--retry-until-up` flag.')
                 with ux_utils.print_exception_no_traceback():
                     raise exceptions.ResourcesUnavailableError(
-                        error_message,
+                        error_message + '\n' + str(e),
                         failover_history=e.failover_history) from None
         if dryrun:
             record = global_user_state.get_cluster_from_name(cluster_name)
@@ -3910,40 +3924,45 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-        # if job_name
+        # if job_name and job_id should not both be specified
         assert job_name is None or job_id is None, (job_name, job_id)
-
+
+        if job_id is None:
             # generate code to get the job_id
+            # if job_name is None, get all job_ids
+            # TODO: Only get the latest job_id, since that's the only one we use
             code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
                 job_name=job_name)
-            returncode,
-
-
-
-                separate_stderr=True)
+            returncode, job_ids, stderr = self.run_on_head(handle,
+                                                           code,
+                                                           stream_logs=False,
+                                                           require_outputs=True,
+                                                           separate_stderr=True)
             subprocess_utils.handle_returncode(returncode, code,
                                                'Failed to sync down logs.',
                                                stderr)
-            job_ids = common_utils.decode_payload(
+            job_ids = common_utils.decode_payload(job_ids)
             if not job_ids:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching job found'
                             f'{colorama.Style.RESET_ALL}')
                 return {}
             elif len(job_ids) > 1:
-
-
-
-
-
-
-
+                name_str = ''
+                if job_name is not None:
+                    name_str = ('Multiple jobs IDs found under the name '
+                                f'{job_name}. ')
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            f'{name_str}'
+                            'Downloading the latest job logs.'
+                            f'{colorama.Style.RESET_ALL}')
+                # list should aready be in descending order
+                job_id = job_ids[0]
 
         # get the run_timestamp
         # the function takes in [job_id]
-        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+            [str(job_id)])
         returncode, run_timestamps, stderr = self.run_on_head(
             handle,
             code,
@@ -3964,13 +3983,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         job_id = list(run_timestamps.keys())[0]
         local_log_dir = ''
         if controller:  # download controller logs
-
-
+            remote_log = os.path.join(
+                managed_jobs_constants.JOBS_CONTROLLER_LOGS_DIR,
+                f'{job_id}.log')
             local_log_dir = os.path.expanduser(
                 os.path.join(local_dir, run_timestamp))
 
             logger.info(f'{colorama.Fore.CYAN}'
-                        f'Job {
+                        f'Job {job_id} local logs: {local_log_dir}'
                         f'{colorama.Style.RESET_ALL}')
 
             runners = handle.get_command_runners()
@@ -3981,12 +4001,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             Args:
                 args: A tuple of (runner, local_log_dir, remote_log_dir)
             """
-            (runner, local_log_dir,
+            (runner, local_log_dir, remote_log) = args
            try:
                 os.makedirs(local_log_dir, exist_ok=True)
                 runner.rsync(
-                    source=
-                    target=local_log_dir,
+                    source=remote_log,
+                    target=f'{local_log_dir}/controller.log',
                     up=False,
                     stream_logs=False,
                 )
@@ -3999,9 +4019,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             else:
                 raise
 
-        parallel_args = [
-
-
+        parallel_args = [
+            (runner, local_log_dir, remote_log) for runner in runners
+        ]
         subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
     else:  # download job logs
         local_log_dir = os.path.expanduser(
@@ -4037,43 +4057,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     f'{colorama.Style.RESET_ALL}')
         return {str(job_id): local_log_dir}
 
-    def tail_serve_logs(self, handle: CloudVmRayResourceHandle,
-                        service_name: str, target: serve_lib.ServiceComponent,
-                        replica_id: Optional[int], follow: bool) -> None:
-        """Tail the logs of a service.
-
-        Args:
-            handle: The handle to the sky serve controller.
-            service_name: The name of the service.
-            target: The component to tail the logs of. Could be controller,
-                load balancer, or replica.
-            replica_id: The replica ID to tail the logs of. Only used when
-                target is replica.
-            follow: Whether to follow the logs.
-        """
-        if target != serve_lib.ServiceComponent.REPLICA:
-            code = serve_lib.ServeCodeGen.stream_serve_process_logs(
-                service_name,
-                stream_controller=(
-                    target == serve_lib.ServiceComponent.CONTROLLER),
-                follow=follow)
-        else:
-            assert replica_id is not None, service_name
-            code = serve_lib.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow)
-
-        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
-        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
-
-        self.run_on_head(
-            handle,
-            code,
-            stream_logs=True,
-            process_stream=False,
-            ssh_mode=command_runner.SshMode.INTERACTIVE,
-            stdin=subprocess.DEVNULL,
-        )
-
     def teardown_no_lock(self,
                          handle: CloudVmRayResourceHandle,
                          terminate: bool,
```
sky/check.py
CHANGED
```diff
@@ -155,7 +155,8 @@ def check(
     # Pretty print for UX.
     if not quiet:
         enabled_clouds_str = '\n  :heavy_check_mark: '.join(
-            [''] +
+            [''] +
+            [_format_enabled_cloud(c) for c in sorted(all_enabled_clouds)])
         rich.print('\n[green]:tada: Enabled clouds :tada:'
                    f'{enabled_clouds_str}[/green]')
 
@@ -222,3 +223,32 @@ def get_cloud_credential_file_mounts(
     r2_credential_mounts = cloudflare.get_credential_file_mounts()
     file_mounts.update(r2_credential_mounts)
     return file_mounts
+
+
+def _format_enabled_cloud(cloud_name: str) -> str:
+    if cloud_name == repr(sky_clouds.Kubernetes()):
+        # Get enabled contexts for Kubernetes
+        existing_contexts = sky_clouds.Kubernetes.existing_allowed_contexts()
+        if not existing_contexts:
+            return cloud_name
+
+        # Check if allowed_contexts is explicitly set in config
+        allowed_contexts = skypilot_config.get_nested(
+            ('kubernetes', 'allowed_contexts'), None)
+
+        # Format the context info with consistent styling
+        if allowed_contexts is not None:
+            contexts_formatted = []
+            for i, context in enumerate(existing_contexts):
+                # TODO: We should use ux_utils.INDENT_SYMBOL and
+                # INDENT_LAST_SYMBOL but, they are formatted for colorama, while
+                # here we are using rich. We should migrate this file to
+                # use colorama as we do in the rest of the codebase.
+                symbol = ('└── ' if i == len(existing_contexts) - 1 else '├── ')
+                contexts_formatted.append(f'\n    {symbol}{context}')
+            context_info = f'Allowed contexts:{"".join(contexts_formatted)}'
+        else:
+            context_info = f'Active context: {existing_contexts[0]}'
+
+        return f'{cloud_name}[/green][dim]\n  └── {context_info}[/dim][green]'
+    return cloud_name
```
sky/cli.py
CHANGED
```diff
@@ -3530,11 +3530,11 @@ def storage_delete(names: List[str], all: bool, yes: bool):  # pylint: disable=r
     if sum([bool(names), all]) != 1:
         raise click.UsageError('Either --all or a name must be specified.')
     if all:
-
-
+        # Use '*' to get all storages.
+        names = global_user_state.get_glob_storage_name(storage_name='*')
+        if not names:
             click.echo('No storage(s) to delete.')
             return
-        names = [s['name'] for s in storages]
     else:
         names = _get_glob_storages(names)
     if names:
@@ -3548,7 +3548,13 @@ def storage_delete(names: List[str], all: bool, yes: bool):  # pylint: disable=r
         abort=True,
         show_default=True)
 
-
+    def delete_storage(name: str) -> None:
+        try:
+            sky.storage_delete(name)
+        except Exception as e:  # pylint: disable=broad-except
+            click.secho(f'Error deleting storage {name}: {e}', fg='red')
+
+    subprocess_utils.run_in_parallel(delete_storage, names)
 
 
 @cli.group(cls=_NaturalOrderGroup)
@@ -3588,18 +3594,6 @@ def jobs():
     is_flag=True,
     help=('If True, as soon as a job is submitted, return from this call '
           'and do not stream execution logs.'))
-@click.option(
-    '--retry-until-up/--no-retry-until-up',
-    '-r/-no-r',
-    default=None,
-    is_flag=True,
-    required=False,
-    help=(
-        '(Default: True; this flag is deprecated and will be removed in a '
-        'future release.) Whether to retry provisioning infinitely until the '
-        'cluster is up, if unavailability errors are encountered. This '  # pylint: disable=bad-docstring-quotes
-        'applies to launching all managed jobs (both the initial and '
-        'any recovery attempts), not the jobs controller.'))
 @click.option('--yes',
               '-y',
               is_flag=True,
@@ -3636,7 +3630,6 @@ def jobs_launch(
     disk_tier: Optional[str],
     ports: Tuple[str],
     detach_run: bool,
-    retry_until_up: Optional[bool],
     yes: bool,
     fast: bool,
 ):
@@ -3680,19 +3673,6 @@ def jobs_launch(
         ports=ports,
         job_recovery=job_recovery,
     )
-    # Deprecation. We set the default behavior to be retry until up, and the
-    # flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0.
-    if retry_until_up is not None:
-        flag_str = '--retry-until-up'
-        if not retry_until_up:
-            flag_str = '--no-retry-until-up'
-        click.secho(
-            f'Flag {flag_str} is deprecated and will be removed in a '
-            'future release (managed jobs will always be retried). '
-            'Please file an issue if this does not work for you.',
-            fg='yellow')
-    else:
-        retry_until_up = True
 
     # Deprecation. The default behavior is fast, and the flag will be removed.
     # The flag was not present in 0.7.x (only nightly), so we will remove before
@@ -3742,10 +3722,7 @@ def jobs_launch(
 
     common_utils.check_cluster_name_is_valid(name)
 
-    managed_jobs.launch(dag,
-                        name,
-                        detach_run=detach_run,
-                        retry_until_up=retry_until_up)
+    managed_jobs.launch(dag, name, detach_run=detach_run)
 
 
 @jobs.command('queue', cls=_DocumentedCodeCommand)
```
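`sky storage delete` now wraps each deletion in its own try/except so one failed bucket no longer aborts the batch, and fans the deletions out with `subprocess_utils.run_in_parallel`. A rough standard-library equivalent of the same pattern (the storage names are placeholders):

```python
from concurrent.futures import ThreadPoolExecutor

import sky  # assumes the skypilot package is installed


def delete_storage(name: str) -> None:
    try:
        sky.storage_delete(name)
    except Exception as e:  # keep going; report per-item failures
        print(f'Error deleting storage {name}: {e}')


names = ['my-bucket-a', 'my-bucket-b']  # placeholder storage names
with ThreadPoolExecutor() as pool:
    # Consume the iterator so all deletions actually run.
    list(pool.map(delete_storage, names))
```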
sky/clouds/kubernetes.py
CHANGED
```diff
@@ -131,7 +131,7 @@ class Kubernetes(clouds.Cloud):
                          'Ignoring these contexts.')
 
     @classmethod
-    def
+    def existing_allowed_contexts(cls) -> List[str]:
         """Get existing allowed contexts.
 
         If None is returned in the list, it means that we are running in a pod
@@ -175,7 +175,7 @@ class Kubernetes(clouds.Cloud):
                             use_spot: bool, region: Optional[str],
                             zone: Optional[str]) -> List[clouds.Region]:
         del accelerators, zone, use_spot  # unused
-        existing_contexts = cls.
+        existing_contexts = cls.existing_allowed_contexts()
 
         regions = []
         for context in existing_contexts:
@@ -591,7 +591,7 @@ class Kubernetes(clouds.Cloud):
     def check_credentials(cls) -> Tuple[bool, Optional[str]]:
         # Test using python API
         try:
-            existing_allowed_contexts = cls.
+            existing_allowed_contexts = cls.existing_allowed_contexts()
         except ImportError as e:
             return (False,
                     f'{common_utils.format_exception(e, use_bracket=True)}')
```
sky/clouds/service_catalog/kubernetes_catalog.py
CHANGED
```diff
@@ -115,6 +115,16 @@ def _list_accelerators(
 
     If the user does not have sufficient permissions to list pods in all
     namespaces, the function will return free GPUs as -1.
+
+    Returns:
+        A tuple of three dictionaries:
+        - qtys_map: Dict mapping accelerator names to lists of InstanceTypeInfo
+          objects with quantity information.
+        - total_accelerators_capacity: Dict mapping accelerator names to their
+          total capacity in the cluster.
+        - total_accelerators_available: Dict mapping accelerator names to their
+          current availability. Returns -1 for each accelerator if
+          realtime=False or if insufficient permissions.
     """
     # TODO(romilb): This should be refactored to use get_kubernetes_node_info()
     # function from kubernetes_utils.
@@ -243,6 +253,10 @@ def _list_accelerators(
 
         accelerators_available = accelerator_count - allocated_qty
 
+        # Initialize the entry if it doesn't exist yet
+        if accelerator_name not in total_accelerators_available:
+            total_accelerators_available[accelerator_name] = 0
+
         if accelerators_available >= min_quantity_filter:
             quantized_availability = min_quantity_filter * (
                 accelerators_available // min_quantity_filter)
```
sky/core.py
CHANGED
```diff
@@ -915,8 +915,11 @@ def storage_delete(name: str) -> None:
     handle = global_user_state.get_handle_from_storage_name(name)
     if handle is None:
         raise ValueError(f'Storage name {name!r} not found.')
-
-
-
-
-
+
+    assert handle.storage_name == name, (
+        f'In global_user_state, storage name {name!r} does not match '
+        f'handle.storage_name {handle.storage_name!r}')
+    storage_object = data.Storage(name=handle.storage_name,
+                                  source=handle.source,
+                                  sync_on_reconstruction=False)
+    storage_object.delete()
```
sky/data/storage.py
CHANGED
```diff
@@ -1083,18 +1083,16 @@ class Storage(object):
         if not self.stores:
             logger.info('No backing stores found. Deleting storage.')
             global_user_state.remove_storage(self.name)
-        if store_type:
+        if store_type is not None:
             store = self.stores[store_type]
-            is_sky_managed = store.is_sky_managed
             # We delete a store from the cloud if it's sky managed. Else just
             # remove handle and return
-            if is_sky_managed:
+            if store.is_sky_managed:
                 self.handle.remove_store(store)
                 store.delete()
                 # Check remaining stores - if none is sky managed, remove
                 # the storage from global_user_state.
-                delete = all(
-                    s.is_sky_managed is False for s in self.stores.values())
+                delete = all(not s.is_sky_managed for s in self.stores.values())
                 if delete:
                     global_user_state.remove_storage(self.name)
                 else:
@@ -1689,6 +1687,9 @@ class S3Store(AbstractStore):
 
         Returns:
             bool; True if bucket was deleted, False if it was deleted externally.
+
+        Raises:
+            StorageBucketDeleteError: If deleting the bucket fails.
         """
         # Deleting objects is very slow programatically
         # (i.e. bucket.objects.all().delete() is slow).
@@ -2179,6 +2180,11 @@ class GcsStore(AbstractStore):
 
         Returns:
             bool; True if bucket was deleted, False if it was deleted externally.
+
+        Raises:
+            StorageBucketDeleteError: If deleting the bucket fails.
+            PermissionError: If the bucket is external and the user is not
+                allowed to delete it.
         """
         if _bucket_sub_path is not None:
             command_suffix = f'/{_bucket_sub_path}'
@@ -3478,6 +3484,9 @@ class R2Store(AbstractStore):
 
         Returns:
             bool; True if bucket was deleted, False if it was deleted externally.
+
+        Raises:
+            StorageBucketDeleteError: If deleting the bucket fails.
         """
         # Deleting objects is very slow programatically
         # (i.e. bucket.objects.all().delete() is slow).
@@ -3932,7 +3941,7 @@ class IBMCosStore(AbstractStore):
 
     def _delete_cos_bucket_objects(self,
                                    bucket: Any,
-                                   prefix: Optional[str] = None):
+                                   prefix: Optional[str] = None) -> None:
         bucket_versioning = self.s3_resource.BucketVersioning(bucket.name)
         if bucket_versioning.status == 'Enabled':
             if prefix is not None:
@@ -3947,7 +3956,7 @@ class IBMCosStore(AbstractStore):
         res = list(bucket.objects.delete())
         logger.debug(f'Deleted bucket\'s content:\n{res}, prefix: {prefix}')
 
-    def _delete_cos_bucket(self):
+    def _delete_cos_bucket(self) -> None:
         bucket = self.s3_resource.Bucket(self.name)
         try:
             self._delete_cos_bucket_objects(bucket)
@@ -3968,7 +3977,7 @@ class OciStore(AbstractStore):
 
     def __init__(self,
                  name: str,
-                 source:
+                 source: Optional[SourceType],
                  region: Optional[str] = None,
                  is_sky_managed: Optional[bool] = None,
                  sync_on_reconstruction: Optional[bool] = True,
@@ -3980,13 +3989,53 @@ class OciStore(AbstractStore):
         self.compartment: str
         self.namespace: str
 
-        #
-
+        # Region is from the specified name in <bucket>@<region> format.
+        # Another case is name can also be set by the source, for example:
+        #   /datasets-storage:
+        #     source: oci://RAGData@us-sanjose-1
+        # The name in above mount will be set to RAGData@us-sanjose-1
+        region_in_name = None
+        if name is not None and '@' in name:
+            self._validate_bucket_expr(name)
+            name, region_in_name = name.split('@')
+
+        # Region is from the specified source in oci://<bucket>@<region> format
+        region_in_source = None
+        if isinstance(source,
+                      str) and source.startswith('oci://') and '@' in source:
+            self._validate_bucket_expr(source)
+            source, region_in_source = source.split('@')
+
+        if region_in_name is not None and region_in_source is not None:
+            # This should never happen because name and source will never be
+            # the remote bucket at the same time.
+            assert region_in_name == region_in_source, (
+                f'Mismatch region specified. Region in name {region_in_name}, '
+                f'but region in source is {region_in_source}')
+
+        if region_in_name is not None:
+            region = region_in_name
+        elif region_in_source is not None:
+            region = region_in_source
+
+        # Default region set to what specified in oci config.
+        if region is None:
+            region = oci.get_oci_config()['region']
+
+        # So far from now on, the name and source are canonical, means there
+        # is no region (@<region> suffix) associated with them anymore.
 
         super().__init__(name, source, region, is_sky_managed,
                          sync_on_reconstruction, _bucket_sub_path)
         # TODO(zpoint): add _bucket_sub_path to the sync/mount/delete commands
 
+    def _validate_bucket_expr(self, bucket_expr: str):
+        pattern = r'^(\w+://)?[A-Za-z0-9-._]+(@\w{2}-\w+-\d{1})$'
+        if not re.match(pattern, bucket_expr):
+            raise ValueError(
+                'The format for the bucket portion is <bucket>@<region> '
+                'when specify a region with a bucket.')
+
     def _validate(self):
         if self.source is not None and isinstance(self.source, str):
             if self.source.startswith('oci://'):
@@ -4137,7 +4186,8 @@ class OciStore(AbstractStore):
         sync_command = (
             'oci os object bulk-upload --no-follow-symlinks --overwrite '
             f'--bucket-name {self.name} --namespace-name {self.namespace} '
-            f'--src-dir "{base_dir_path}"
+            f'--region {self.region} --src-dir "{base_dir_path}" '
+            f'{includes}')
 
         return sync_command
@@ -4157,8 +4207,8 @@ class OciStore(AbstractStore):
         sync_command = (
             'oci os object bulk-upload --no-follow-symlinks --overwrite '
             f'--bucket-name {self.name} --namespace-name {self.namespace} '
-            f'--
-            f'{excludes}
+            f'--region {self.region} --object-prefix "{dest_dir_name}" '
+            f'--src-dir "{src_dir_path}" {excludes}')
 
         return sync_command
@@ -4289,7 +4339,8 @@ class OciStore(AbstractStore):
         def get_file_download_command(remote_path, local_path):
             download_command = (f'oci os object get --bucket-name {self.name} '
                                 f'--namespace-name {self.namespace} '
-                                f'--
+                                f'--region {self.region} --name {remote_path} '
+                                f'--file {local_path}')
 
             return download_command
@@ -4346,6 +4397,7 @@ class OciStore(AbstractStore):
         @oci.with_oci_env
         def get_bucket_delete_command(bucket_name):
             remove_command = (f'oci os bucket delete --bucket-name '
+                              f'--region {self.region} '
                               f'{bucket_name} --empty --force')
 
             return remove_command
```
sky/global_user_state.py
CHANGED
```diff
@@ -827,7 +827,7 @@ def get_storage_names_start_with(starts_with: str) -> List[str]:
 
 
 def get_storage() -> List[Dict[str, Any]]:
-    rows = _DB.cursor.execute('
+    rows = _DB.cursor.execute('SELECT * FROM storage')
     records = []
     for name, launched_at, handle, last_use, status in rows:
         # TODO: use namedtuple instead of dict
```
sky/jobs/constants.py
CHANGED
```diff
@@ -2,18 +2,19 @@
 
 JOBS_CONTROLLER_TEMPLATE = 'jobs-controller.yaml.j2'
 JOBS_CONTROLLER_YAML_PREFIX = '~/.sky/jobs_controller'
+JOBS_CONTROLLER_LOGS_DIR = '~/sky_logs/jobs_controller'
 
 JOBS_TASK_YAML_PREFIX = '~/.sky/managed_jobs'
 
 # Resources as a dict for the jobs controller.
-# Use
-#
-#
-#
-#
-#
+# Use smaller CPU instance type for jobs controller, but with more memory, i.e.
+# r6i.xlarge (4vCPUs, 32 GB) for AWS, Standard_E4s_v5 (4vCPUs, 32 GB) for Azure,
+# and n2-highmem-4 (4 vCPUs, 32 GB) for GCP, etc.
+# Concurrently limits are set based on profiling. 4x num vCPUs is the launch
+# parallelism limit, and memory / 350MB is the limit to concurrently running
+# jobs. See _get_launch_parallelism and _get_job_parallelism in scheduler.py.
 # We use 50 GB disk size to reduce the cost.
-CONTROLLER_RESOURCES = {'cpus': '
+CONTROLLER_RESOURCES = {'cpus': '4+', 'memory': '8x', 'disk_size': 50}
 
 # Max length of the cluster name for GCP is 35, the user hash to be attached is
 # 4+1 chars, and we assume the maximum length of the job id is 4+1, so the max
```