skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/common.py +15 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/oci.py +32 -1
- sky/authentication.py +20 -8
- sky/backends/backend_utils.py +44 -0
- sky/backends/cloud_vm_ray_backend.py +202 -41
- sky/backends/wheel_utils.py +4 -1
- sky/check.py +31 -1
- sky/cli.py +39 -43
- sky/cloud_stores.py +71 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +137 -50
- sky/clouds/cloud.py +4 -0
- sky/clouds/do.py +303 -0
- sky/clouds/gcp.py +9 -0
- sky/clouds/kubernetes.py +3 -3
- sky/clouds/oci.py +20 -9
- sky/clouds/service_catalog/__init__.py +7 -3
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/clouds/utils/oci_utils.py +15 -2
- sky/core.py +8 -5
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +19 -4
- sky/data/mounting_utils.py +99 -15
- sky/data/storage.py +961 -130
- sky/global_user_state.py +1 -1
- sky/jobs/__init__.py +2 -0
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +46 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +290 -21
- sky/jobs/utils.py +346 -95
- sky/optimizer.py +6 -3
- sky/provision/aws/config.py +59 -29
- sky/provision/azure/instance.py +1 -1
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +306 -0
- sky/provision/docker_utils.py +22 -11
- sky/provision/gcp/instance_utils.py +15 -9
- sky/provision/kubernetes/instance.py +3 -2
- sky/provision/kubernetes/utils.py +125 -20
- sky/provision/oci/query_utils.py +17 -14
- sky/provision/provisioner.py +0 -1
- sky/provision/runpod/instance.py +10 -1
- sky/provision/runpod/utils.py +170 -13
- sky/resources.py +1 -1
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +84 -7
- sky/serve/load_balancer.py +27 -10
- sky/serve/replica_managers.py +1 -3
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/setup_files/dependencies.py +4 -1
- sky/skylet/constants.py +8 -4
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/providers/command_runner.py +5 -7
- sky/skylet/skylet.py +1 -1
- sky/task.py +28 -1
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/jobs-controller.yaml.j2 +41 -7
- sky/templates/runpod-ray.yml.j2 +13 -0
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/usage/usage_lib.py +10 -2
- sky/utils/accelerator_registry.py +12 -8
- sky/utils/controller_utils.py +114 -39
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/log_utils.py +2 -0
- sky/utils/resources_utils.py +25 -21
- sky/utils/schemas.py +27 -0
- sky/utils/subprocess_utils.py +54 -10
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '1c94d0f001ed6519873a59a7b46681d64dd696d2'


 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250124'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))


sky/adaptors/common.py
CHANGED
@@ -1,6 +1,7 @@
 """Lazy import for modules to avoid import error when not used."""
 import functools
 import importlib
+import threading
 from typing import Any, Callable, Optional, Tuple


@@ -24,17 +25,22 @@ class LazyImport:
         self._module = None
         self._import_error_message = import_error_message
         self._set_loggers = set_loggers
+        self._lock = threading.RLock()

     def load_module(self):
-
-
-
-
-
-
-
-
-
+        # Avoid extra imports when multiple threads try to import the same
+        # module. The overhead is minor since import can only run in serial
+        # due to GIL even in multi-threaded environments.
+        with self._lock:
+            if self._module is None:
+                try:
+                    self._module = importlib.import_module(self._module_name)
+                    if self._set_loggers is not None:
+                        self._set_loggers()
+                except ImportError as e:
+                    if self._import_error_message is not None:
+                        raise ImportError(self._import_error_message) from e
+                    raise
         return self._module

     def __getattr__(self, name: str) -> Any:
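For reference, a minimal standalone sketch of the double-checked lazy-import pattern that the new `load_module` uses; the `_LazyModule` class and the `json` example below are illustrative stand-ins, not SkyPilot code:

import importlib
import threading
from typing import Any, Optional


class _LazyModule:
    """Illustrative proxy that imports the real module on first attribute access."""

    def __init__(self, name: str, import_error_message: Optional[str] = None):
        self._name = name
        self._error = import_error_message
        self._module = None
        self._lock = threading.RLock()

    def _load(self):
        # Double-checked locking: concurrent first accesses import the module
        # only once; later calls just return the cached module.
        with self._lock:
            if self._module is None:
                try:
                    self._module = importlib.import_module(self._name)
                except ImportError as e:
                    if self._error is not None:
                        raise ImportError(self._error) from e
                    raise
        return self._module

    def __getattr__(self, name: str) -> Any:
        return getattr(self._load(), name)


json = _LazyModule('json')         # nothing imported yet
print(json.dumps({'lazy': True}))  # first attribute access triggers the import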
sky/adaptors/do.py
ADDED
@@ -0,0 +1,20 @@
+"""Digital Ocean cloud adaptors"""
+
+# pylint: disable=import-outside-toplevel
+
+from sky.adaptors import common
+
+_IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for DO. '
+                         'Try pip install "skypilot[do]"')
+pydo = common.LazyImport('pydo', import_error_message=_IMPORT_ERROR_MESSAGE)
+azure = common.LazyImport('azure', import_error_message=_IMPORT_ERROR_MESSAGE)
+_LAZY_MODULES = (pydo, azure)
+
+
+# `pydo`` inherits Azure exceptions. See:
+# https://github.com/digitalocean/pydo/blob/7b01498d99eb0d3a772366b642e5fab3d6fc6aa2/examples/poc_droplets_volumes_sshkeys.py#L6
+@common.load_lazy_modules(modules=_LAZY_MODULES)
+def exceptions():
+    """Azure exceptions."""
+    from azure.core import exceptions as azure_exceptions
+    return azure_exceptions
sky/adaptors/oci.py
CHANGED
@@ -1,9 +1,11 @@
 """Oracle OCI cloud adaptor"""

+import functools
 import logging
 import os

 from sky.adaptors import common
+from sky.clouds.utils import oci_utils

 # Suppress OCI circuit breaker logging before lazy import, because
 # oci modules prints additional message during imports, i.e., the
@@ -30,10 +32,16 @@ def get_config_file() -> str:

 def get_oci_config(region=None, profile='DEFAULT'):
     conf_file_path = get_config_file()
+    if not profile or profile == 'DEFAULT':
+        config_profile = oci_utils.oci_config.get_profile()
+    else:
+        config_profile = profile
+
     oci_config = oci.config.from_file(file_location=conf_file_path,
-                                      profile_name=
+                                      profile_name=config_profile)
     if region is not None:
         oci_config['region'] = region
+
     return oci_config


@@ -54,6 +62,29 @@ def get_identity_client(region=None, profile='DEFAULT'):
     return oci.identity.IdentityClient(get_oci_config(region, profile))


+def get_object_storage_client(region=None, profile='DEFAULT'):
+    return oci.object_storage.ObjectStorageClient(
+        get_oci_config(region, profile))
+
+
 def service_exception():
     """OCI service exception."""
     return oci.exceptions.ServiceError
+
+
+def with_oci_env(f):
+
+    @functools.wraps(f)
+    def wrapper(*args, **kwargs):
+        # pylint: disable=line-too-long
+        enter_env_cmds = [
+            'conda info --envs | grep "sky-oci-cli-env" || conda create -n sky-oci-cli-env python=3.10 -y',
+            '. $(conda info --base 2> /dev/null)/etc/profile.d/conda.sh > /dev/null 2>&1 || true',
+            'conda activate sky-oci-cli-env', 'pip install oci-cli',
+            'export OCI_CLI_SUPPRESS_FILE_PERMISSIONS_WARNING=True'
+        ]
+        operation_cmd = [f(*args, **kwargs)]
+        leave_env_cmds = ['conda deactivate']
+        return ' && '.join(enter_env_cmds + operation_cmd + leave_env_cmds)
+
+    return wrapper
sky/authentication.py
CHANGED
@@ -408,14 +408,26 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
     secret = k8s.client.V1Secret(
         metadata=k8s.client.V1ObjectMeta(**secret_metadata),
         string_data={secret_field_name: public_key})
-
-
-
-        secret_name
-
-
-
-
+    try:
+        if kubernetes_utils.check_secret_exists(secret_name, namespace,
+                                                context):
+            logger.debug(f'Key {secret_name} exists in the cluster, '
+                         'patching it...')
+            kubernetes.core_api(context).patch_namespaced_secret(
+                secret_name, namespace, secret)
+        else:
+            logger.debug(f'Key {secret_name} does not exist in the cluster, '
+                         'creating it...')
+            kubernetes.core_api(context).create_namespaced_secret(
+                namespace, secret)
+    except kubernetes.api_exception() as e:
+        if e.status == 409 and e.reason == 'AlreadyExists':
+            logger.debug(f'Key {secret_name} was created concurrently, '
+                         'patching it...')
+            kubernetes.core_api(context).patch_namespaced_secret(
+                secret_name, namespace, secret)
+        else:
+            raise e

     private_key_path, _ = get_or_generate_keys()
     if network_mode == nodeport_mode:
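The new block above follows a create-or-patch pattern for the SSH-key secret: check for the secret, patch if present, create if absent, and if a concurrent writer wins the create race, catch the 409 conflict and patch instead. A condensed sketch of the same idea with the plain `kubernetes` client (function and variable names here are illustrative):

from kubernetes import client
from kubernetes.client.rest import ApiException


def upsert_secret(api: client.CoreV1Api, namespace: str,
                  secret: client.V1Secret) -> None:
    """Create the secret, falling back to patch if it already exists."""
    try:
        api.create_namespaced_secret(namespace, secret)
    except ApiException as e:
        if e.status == 409:
            # 409 Conflict: another writer created it first; converge by patching.
            api.patch_namespaced_secret(secret.metadata.name, namespace, secret)
        else:
            raise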
sky/backends/backend_utils.py
CHANGED
@@ -650,6 +650,42 @@ def _replace_yaml_dicts(
     return common_utils.dump_yaml_str(new_config)


+def get_expirable_clouds(
+        enabled_clouds: Sequence[clouds.Cloud]) -> List[clouds.Cloud]:
+    """Returns a list of clouds that use local credentials and whose credentials can expire.
+
+    This function checks each cloud in the provided sequence to determine if it uses local credentials
+    and if its credentials can expire. If both conditions are met, the cloud is added to the list of
+    expirable clouds.
+
+    Args:
+        enabled_clouds (Sequence[clouds.Cloud]): A sequence of cloud objects to check.
+
+    Returns:
+        list[clouds.Cloud]: A list of cloud objects that use local credentials and whose credentials can expire.
+    """
+    expirable_clouds = []
+    local_credentials_value = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value
+    for cloud in enabled_clouds:
+        remote_identities = skypilot_config.get_nested(
+            (str(cloud).lower(), 'remote_identity'), None)
+        if remote_identities is None:
+            remote_identities = schemas.get_default_remote_identity(
+                str(cloud).lower())
+
+        local_credential_expiring = cloud.can_credential_expire()
+        if isinstance(remote_identities, str):
+            if remote_identities == local_credentials_value and local_credential_expiring:
+                expirable_clouds.append(cloud)
+        elif isinstance(remote_identities, list):
+            for profile in remote_identities:
+                if list(profile.values(
+                ))[0] == local_credentials_value and local_credential_expiring:
+                    expirable_clouds.append(cloud)
+                    break
+    return expirable_clouds
+
+
 # TODO: too many things happening here - leaky abstraction. Refactor.
 @timeline.event
 def write_cluster_config(
@@ -926,6 +962,13 @@ def write_cluster_config(
             tmp_yaml_path,
             cluster_config_overrides=to_provision.cluster_config_overrides)
         kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
+        yaml_obj = common_utils.read_yaml(tmp_yaml_path)
+        pod_config = yaml_obj['available_node_types']['ray_head_default'][
+            'node_config']
+        valid, message = kubernetes_utils.check_pod_config(pod_config)
+        if not valid:
+            raise exceptions.InvalidCloudConfigs(
+                f'Invalid pod_config. Details: {message}')

     if dryrun:
         # If dryrun, return the unfinished tmp yaml path.
@@ -1000,6 +1043,7 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
             clouds.Cudo,
             clouds.Paperspace,
             clouds.Azure,
+            clouds.DO,
         )):
         config = auth.configure_ssh_info(config)
     elif isinstance(cloud, clouds.GCP):
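For context on `get_expirable_clouds` above, the `remote_identity` setting it reads can be either a plain string or a list of single-key mappings; the helper flags a cloud as soon as any value is `LOCAL_CREDENTIALS` and the cloud reports `can_credential_expire()`. A small self-contained sketch of that value check (the config values below are made up):

def uses_local_credentials(remote_identity) -> bool:
    """Mirror of the string-vs-list handling in get_expirable_clouds."""
    local = 'LOCAL_CREDENTIALS'
    if isinstance(remote_identity, str):
        return remote_identity == local
    # List form: each entry is a single-key mapping; only the value matters here.
    return any(list(entry.values())[0] == local for entry in remote_identity)


assert uses_local_credentials('LOCAL_CREDENTIALS')
assert uses_local_credentials([{'my-cluster-*': 'LOCAL_CREDENTIALS'},
                               {'*': 'SERVICE_ACCOUNT'}])
assert not uses_local_credentials('SERVICE_ACCOUNT')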
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -10,6 +10,7 @@ import os
 import pathlib
 import re
 import shlex
+import shutil
 import signal
 import subprocess
 import sys
@@ -26,6 +27,7 @@ import filelock

 import sky
 from sky import backends
+from sky import check as sky_check
 from sky import cloud_stores
 from sky import clouds
 from sky import exceptions
@@ -34,7 +36,6 @@ from sky import jobs as managed_jobs
 from sky import optimizer
 from sky import provision as provision_lib
 from sky import resources as resources_lib
-from sky import serve as serve_lib
 from sky import sky_logging
 from sky import status_lib
 from sky import task as task_lib
@@ -44,6 +45,7 @@ from sky.clouds import service_catalog
 from sky.clouds.utils import gcp_utils
 from sky.data import data_utils
 from sky.data import storage as storage_lib
+from sky.jobs import constants as managed_jobs_constants
 from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
@@ -154,6 +156,9 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
 # might be added during ssh.
 _MAX_INLINE_SCRIPT_LENGTH = 120 * 1024

+_RESOURCES_UNAVAILABLE_LOG = (
+    'Reasons for provision failures (for details, please check the log above):')
+

 def _is_command_length_over_limit(command: str) -> bool:
     """Check if the length of the command exceeds the limit.
@@ -178,6 +183,7 @@ def _get_cluster_config_template(cloud):
         clouds.SCP: 'scp-ray.yml.j2',
         clouds.OCI: 'oci-ray.yml.j2',
         clouds.Paperspace: 'paperspace-ray.yml.j2',
+        clouds.DO: 'do-ray.yml.j2',
         clouds.RunPod: 'runpod-ray.yml.j2',
         clouds.Kubernetes: 'kubernetes-ray.yml.j2',
         clouds.Vsphere: 'vsphere-ray.yml.j2',
@@ -1995,6 +2001,23 @@ class RetryingVmProvisioner(object):
             skip_unnecessary_provisioning else None)

         failover_history: List[Exception] = list()
+        resource_exceptions: Dict[resources_lib.Resources, Exception] = dict()
+        # If the user is using local credentials which may expire, the
+        # controller may leak resources if the credentials expire while a job
+        # is running. Here we check the enabled clouds and expiring credentials
+        # and raise a warning to the user.
+        if task.is_controller_task():
+            enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh()
+            expirable_clouds = backend_utils.get_expirable_clouds(
+                enabled_clouds)
+
+            if len(expirable_clouds) > 0:
+                warnings = (f'\033[93mWarning: Credentials used for '
+                            f'{expirable_clouds} may expire. Clusters may be '
+                            f'leaked if the credentials expire while jobs '
+                            f'are running. It is recommended to use credentials'
+                            f' that never expire or a service account.\033[0m')
+                logger.warning(warnings)

         # Retrying launchable resources.
         while True:
@@ -2070,6 +2093,8 @@ class RetryingVmProvisioner(object):
                 # Add failed resources to the blocklist, only when it
                 # is in fallback mode.
                 _add_to_blocked_resources(self._blocked_resources, to_provision)
+                assert len(failover_history) > 0
+                resource_exceptions[to_provision] = failover_history[-1]
             else:
                 # If we reach here, it means that the existing cluster must have
                 # a previous status of INIT, because other statuses (UP,
@@ -2114,7 +2139,14 @@ class RetryingVmProvisioner(object):
                 # possible resources or the requested resources is too
                 # restrictive. If we reach here, our failover logic finally
                 # ends here.
-
+                table = log_utils.create_table(['Resource', 'Reason'])
+                for (resource, exception) in resource_exceptions.items():
+                    table.add_row(
+                        [resources_utils.format_resource(resource), exception])
+                table.max_table_width = shutil.get_terminal_size().columns
+                raise exceptions.ResourcesUnavailableError(
+                    _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
+                    failover_history=failover_history)
             to_provision = task.best_resources
             assert task in self._dag.tasks, 'Internal logic error.'
             assert to_provision is not None, task
@@ -2877,7 +2909,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'the `--retry-until-up` flag.')
                 with ux_utils.print_exception_no_traceback():
                     raise exceptions.ResourcesUnavailableError(
-                        error_message,
+                        error_message + '\n' + str(e),
                         failover_history=e.failover_history) from None
         if dryrun:
             record = global_user_state.get_cluster_from_name(cluster_name)
@@ -3309,7 +3341,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # even if some of them raise exceptions. We should replace it with
         # multi-process.
         rich_utils.stop_safe_status()
-        subprocess_utils.run_in_parallel(_setup_node, range(num_nodes))
+        subprocess_utils.run_in_parallel(_setup_node, list(range(num_nodes)))

         if detach_setup:
             # Only set this when setup needs to be run outside the self._setup()
@@ -3873,42 +3905,157 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             stdin=subprocess.DEVNULL,
         )

-    def 
-
-
-
+    def sync_down_managed_job_logs(
+            self,
+            handle: CloudVmRayResourceHandle,
+            job_id: Optional[int] = None,
+            job_name: Optional[str] = None,
+            controller: bool = False,
+            local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[str, str]:
+        """Sync down logs for a managed job.

         Args:
-            handle: The handle to the
-
-
-
-
-                target is replica.
-            follow: Whether to follow the logs.
-        """
-        if target != serve_lib.ServiceComponent.REPLICA:
-            code = serve_lib.ServeCodeGen.stream_serve_process_logs(
-                service_name,
-                stream_controller=(
-                    target == serve_lib.ServiceComponent.CONTROLLER),
-                follow=follow)
-        else:
-            assert replica_id is not None, service_name
-            code = serve_lib.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow)
+            handle: The handle to the cluster.
+            job_id: The job ID to sync down logs for.
+            job_name: The job name to sync down logs for.
+            controller: Whether to sync down logs for the controller.
+            local_dir: The local directory to sync down logs to.

-
-
+        Returns:
+            A dictionary mapping job_id to log path.
+        """
+        # if job_name and job_id should not both be specified
+        assert job_name is None or job_id is None, (job_name, job_id)

-
+        if job_id is None:
+            # generate code to get the job_id
+            # if job_name is None, get all job_ids
+            # TODO: Only get the latest job_id, since that's the only one we use
+            code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                job_name=job_name)
+            returncode, job_ids, stderr = self.run_on_head(handle,
+                                                           code,
+                                                           stream_logs=False,
+                                                           require_outputs=True,
+                                                           separate_stderr=True)
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to sync down logs.',
+                                               stderr)
+            job_ids = common_utils.decode_payload(job_ids)
+            if not job_ids:
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            'No matching job found'
+                            f'{colorama.Style.RESET_ALL}')
+                return {}
+            elif len(job_ids) > 1:
+                name_str = ''
+                if job_name is not None:
+                    name_str = ('Multiple jobs IDs found under the name '
+                                f'{job_name}. ')
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            f'{name_str}'
+                            'Downloading the latest job logs.'
+                            f'{colorama.Style.RESET_ALL}')
+            # list should aready be in descending order
+            job_id = job_ids[0]
+
+        # get the run_timestamp
+        # the function takes in [job_id]
+        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+            [str(job_id)])
+        returncode, run_timestamps, stderr = self.run_on_head(
             handle,
             code,
-            stream_logs=
-
-
-
-
+            stream_logs=False,
+            require_outputs=True,
+            separate_stderr=True)
+        subprocess_utils.handle_returncode(returncode, code,
+                                           'Failed to sync logs.', stderr)
+        # returns with a dict of {job_id: run_timestamp}
+        run_timestamps = common_utils.decode_payload(run_timestamps)
+        if not run_timestamps:
+            logger.info(f'{colorama.Fore.YELLOW}'
+                        'No matching log directories found'
+                        f'{colorama.Style.RESET_ALL}')
+            return {}
+
+        run_timestamp = list(run_timestamps.values())[0]
+        job_id = list(run_timestamps.keys())[0]
+        local_log_dir = ''
+        if controller:  # download controller logs
+            remote_log = os.path.join(
+                managed_jobs_constants.JOBS_CONTROLLER_LOGS_DIR,
+                f'{job_id}.log')
+            local_log_dir = os.path.expanduser(
+                os.path.join(local_dir, run_timestamp))
+
+            logger.info(f'{colorama.Fore.CYAN}'
+                        f'Job {job_id} local logs: {local_log_dir}'
+                        f'{colorama.Style.RESET_ALL}')
+
+            runners = handle.get_command_runners()
+
+            def _rsync_down(args) -> None:
+                """Rsync down logs from remote nodes.
+
+                Args:
+                    args: A tuple of (runner, local_log_dir, remote_log_dir)
+                """
+                (runner, local_log_dir, remote_log) = args
+                try:
+                    os.makedirs(local_log_dir, exist_ok=True)
+                    runner.rsync(
+                        source=remote_log,
+                        target=f'{local_log_dir}/controller.log',
+                        up=False,
+                        stream_logs=False,
+                    )
+                except exceptions.CommandError as e:
+                    if e.returncode == exceptions.RSYNC_FILE_NOT_FOUND_CODE:
+                        # Raised by rsync_down. Remote log dir may not exist
+                        # since the job can be run on some part of the nodes.
+                        logger.debug(
+                            f'{runner.node_id} does not have the tasks/*.')
+                    else:
+                        raise
+
+            parallel_args = [
+                (runner, local_log_dir, remote_log) for runner in runners
+            ]
+            subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
+        else:  # download job logs
+            local_log_dir = os.path.expanduser(
+                os.path.join(local_dir, 'managed_jobs', run_timestamp))
+            os.makedirs(os.path.dirname(local_log_dir), exist_ok=True)
+            log_file = os.path.join(local_log_dir, 'run.log')
+
+            code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
+                                                              job_id=job_id,
+                                                              follow=False,
+                                                              controller=False)
+
+            # With the stdin=subprocess.DEVNULL, the ctrl-c will not
+            # kill the process, so we need to handle it manually here.
+            if threading.current_thread() is threading.main_thread():
+                signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
+                signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
+
+            # We redirect the output to the log file
+            # and disable the STDOUT and STDERR
+            self.run_on_head(
+                handle,
+                code,
+                log_path=log_file,
+                stream_logs=False,
+                process_stream=False,
+                ssh_mode=command_runner.SshMode.INTERACTIVE,
+                stdin=subprocess.DEVNULL,
+            )
+
+            logger.info(f'{colorama.Fore.CYAN}'
+                        f'Job {job_id} logs: {local_log_dir}'
+                        f'{colorama.Style.RESET_ALL}')
+        return {str(job_id): local_log_dir}

     def teardown_no_lock(self,
                          handle: CloudVmRayResourceHandle,
@@ -4198,11 +4345,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         attempts = 0
         while True:
             logger.debug(f'instance statuses attempt {attempts + 1}')
-
-
-
-
-
+            try:
+                node_status_dict = provision_lib.query_instances(
+                    repr(cloud),
+                    cluster_name_on_cloud,
+                    config['provider'],
+                    non_terminated_only=False)
+            except Exception as e:  # pylint: disable=broad-except
+                if purge:
+                    logger.warning(
+                        f'Failed to query instances. Skipping since purge is '
+                        f'set. Details: '
+                        f'{common_utils.format_exception(e, use_bracket=True)}')
+                    break
+                raise

             unexpected_node_state: Optional[Tuple[str, str]] = None
             for node_id, node_status in node_status_dict.items():
@@ -4221,8 +4377,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
             else:
                 (node_id, node_status) = unexpected_node_state
-
-
+                if purge:
+                    logger.warning(f'Instance {node_id} in unexpected '
+                                   f'state {node_status}. Skipping since purge '
+                                   'is set.')
+                    break
+                raise RuntimeError(f'Instance {node_id} in unexpected '
+                                   f'state {node_status}.')

         global_user_state.remove_cluster(handle.cluster_name,
                                          terminate=terminate)
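The failover summary added above renders one row per attempted resource with the last exception seen for it. A standalone sketch of that presentation using `prettytable` directly (whether `log_utils.create_table` is a thin wrapper around prettytable is an assumption here, and the resource/reason strings are invented):

import shutil

import prettytable


def format_failover_table(resource_exceptions: dict) -> str:
    # One row per (resource, reason), capped to the terminal width so the
    # table does not wrap awkwardly in narrow terminals.
    table = prettytable.PrettyTable(['Resource', 'Reason'])
    for resource, exception in resource_exceptions.items():
        table.add_row([resource, exception])
    table.max_table_width = shutil.get_terminal_size().columns
    return table.get_string()


print(format_failover_table({
    'AWS(p4d.24xlarge)': 'ResourcesUnavailableError: no capacity in us-east-1',
    'GCP(a2-ultragpu-8g)': 'QuotaExceeded: A100 quota is 0',
}))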
sky/backends/wheel_utils.py
CHANGED
@@ -153,7 +153,10 @@ def build_sky_wheel() -> Tuple[pathlib.Path, str]:
         if not path.exists():
             return -1.
         try:
-            return max(
+            return max(
+                os.path.getmtime(os.path.join(root, f))
+                for root, dirs, files in os.walk(path)
+                for f in (*dirs, *files))
         except ValueError:
             return -1.

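The wheel_utils change above broadens the freshness check to the newest mtime across every file and directory under the path; including directories matters because deleting or renaming a file bumps only the parent directory's mtime. A tiny standalone sketch of that scan (path handling simplified):

import os


def latest_mtime(path: str) -> float:
    """Newest modification time of anything under `path`, or -1.0 if empty/missing."""
    if not os.path.exists(path):
        return -1.0
    try:
        return max(
            os.path.getmtime(os.path.join(root, name))
            for root, dirs, files in os.walk(path)
            for name in (*dirs, *files))
    except ValueError:  # os.walk yielded no entries at all
        return -1.0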
sky/check.py
CHANGED
@@ -155,7 +155,8 @@ def check(
     # Pretty print for UX.
     if not quiet:
         enabled_clouds_str = '\n :heavy_check_mark: '.join(
-            [''] +
+            [''] +
+            [_format_enabled_cloud(c) for c in sorted(all_enabled_clouds)])
         rich.print('\n[green]:tada: Enabled clouds :tada:'
                    f'{enabled_clouds_str}[/green]')

@@ -222,3 +223,32 @@ def get_cloud_credential_file_mounts(
     r2_credential_mounts = cloudflare.get_credential_file_mounts()
     file_mounts.update(r2_credential_mounts)
     return file_mounts
+
+
+def _format_enabled_cloud(cloud_name: str) -> str:
+    if cloud_name == repr(sky_clouds.Kubernetes()):
+        # Get enabled contexts for Kubernetes
+        existing_contexts = sky_clouds.Kubernetes.existing_allowed_contexts()
+        if not existing_contexts:
+            return cloud_name
+
+        # Check if allowed_contexts is explicitly set in config
+        allowed_contexts = skypilot_config.get_nested(
+            ('kubernetes', 'allowed_contexts'), None)
+
+        # Format the context info with consistent styling
+        if allowed_contexts is not None:
+            contexts_formatted = []
+            for i, context in enumerate(existing_contexts):
+                # TODO: We should use ux_utils.INDENT_SYMBOL and
+                # INDENT_LAST_SYMBOL but, they are formatted for colorama, while
+                # here we are using rich. We should migrate this file to
+                # use colorama as we do in the rest of the codebase.
+                symbol = ('└── ' if i == len(existing_contexts) - 1 else '├── ')
+                contexts_formatted.append(f'\n {symbol}{context}')
+            context_info = f'Allowed contexts:{"".join(contexts_formatted)}'
+        else:
+            context_info = f'Active context: {existing_contexts[0]}'
+
+        return f'{cloud_name}[/green][dim]\n └── {context_info}[/dim][green]'
+    return cloud_name