skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/common.py +15 -9
  3. sky/adaptors/do.py +20 -0
  4. sky/adaptors/oci.py +32 -1
  5. sky/authentication.py +20 -8
  6. sky/backends/backend_utils.py +44 -0
  7. sky/backends/cloud_vm_ray_backend.py +202 -41
  8. sky/backends/wheel_utils.py +4 -1
  9. sky/check.py +31 -1
  10. sky/cli.py +39 -43
  11. sky/cloud_stores.py +71 -2
  12. sky/clouds/__init__.py +2 -0
  13. sky/clouds/aws.py +137 -50
  14. sky/clouds/cloud.py +4 -0
  15. sky/clouds/do.py +303 -0
  16. sky/clouds/gcp.py +9 -0
  17. sky/clouds/kubernetes.py +3 -3
  18. sky/clouds/oci.py +20 -9
  19. sky/clouds/service_catalog/__init__.py +7 -3
  20. sky/clouds/service_catalog/constants.py +1 -1
  21. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
  22. sky/clouds/service_catalog/do_catalog.py +111 -0
  23. sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
  24. sky/clouds/utils/oci_utils.py +15 -2
  25. sky/core.py +8 -5
  26. sky/data/data_transfer.py +37 -0
  27. sky/data/data_utils.py +19 -4
  28. sky/data/mounting_utils.py +99 -15
  29. sky/data/storage.py +961 -130
  30. sky/global_user_state.py +1 -1
  31. sky/jobs/__init__.py +2 -0
  32. sky/jobs/constants.py +8 -7
  33. sky/jobs/controller.py +19 -22
  34. sky/jobs/core.py +46 -2
  35. sky/jobs/recovery_strategy.py +114 -143
  36. sky/jobs/scheduler.py +283 -0
  37. sky/jobs/state.py +290 -21
  38. sky/jobs/utils.py +346 -95
  39. sky/optimizer.py +6 -3
  40. sky/provision/aws/config.py +59 -29
  41. sky/provision/azure/instance.py +1 -1
  42. sky/provision/do/__init__.py +11 -0
  43. sky/provision/do/config.py +14 -0
  44. sky/provision/do/constants.py +10 -0
  45. sky/provision/do/instance.py +287 -0
  46. sky/provision/do/utils.py +306 -0
  47. sky/provision/docker_utils.py +22 -11
  48. sky/provision/gcp/instance_utils.py +15 -9
  49. sky/provision/kubernetes/instance.py +3 -2
  50. sky/provision/kubernetes/utils.py +125 -20
  51. sky/provision/oci/query_utils.py +17 -14
  52. sky/provision/provisioner.py +0 -1
  53. sky/provision/runpod/instance.py +10 -1
  54. sky/provision/runpod/utils.py +170 -13
  55. sky/resources.py +1 -1
  56. sky/serve/autoscalers.py +359 -301
  57. sky/serve/controller.py +10 -8
  58. sky/serve/core.py +84 -7
  59. sky/serve/load_balancer.py +27 -10
  60. sky/serve/replica_managers.py +1 -3
  61. sky/serve/serve_state.py +10 -5
  62. sky/serve/serve_utils.py +28 -1
  63. sky/serve/service.py +4 -3
  64. sky/serve/service_spec.py +31 -0
  65. sky/setup_files/dependencies.py +4 -1
  66. sky/skylet/constants.py +8 -4
  67. sky/skylet/events.py +7 -3
  68. sky/skylet/job_lib.py +10 -30
  69. sky/skylet/log_lib.py +8 -8
  70. sky/skylet/log_lib.pyi +3 -0
  71. sky/skylet/providers/command_runner.py +5 -7
  72. sky/skylet/skylet.py +1 -1
  73. sky/task.py +28 -1
  74. sky/templates/do-ray.yml.j2 +98 -0
  75. sky/templates/jobs-controller.yaml.j2 +41 -7
  76. sky/templates/runpod-ray.yml.j2 +13 -0
  77. sky/templates/sky-serve-controller.yaml.j2 +4 -0
  78. sky/usage/usage_lib.py +10 -2
  79. sky/utils/accelerator_registry.py +12 -8
  80. sky/utils/controller_utils.py +114 -39
  81. sky/utils/db_utils.py +18 -4
  82. sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
  83. sky/utils/log_utils.py +2 -0
  84. sky/utils/resources_utils.py +25 -21
  85. sky/utils/schemas.py +27 -0
  86. sky/utils/subprocess_utils.py +54 -10
  87. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
  88. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
  89. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
  90. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
  91. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
  92. {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '138679859b9844a8737f8dff1bf5a739e77e96c4'
+_SKYPILOT_COMMIT_SHA = '1c94d0f001ed6519873a59a7b46681d64dd696d2'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20241227'
+__version__ = '1.0.0.dev20250124'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/adaptors/common.py CHANGED
@@ -1,6 +1,7 @@
 """Lazy import for modules to avoid import error when not used."""
 import functools
 import importlib
+import threading
 from typing import Any, Callable, Optional, Tuple
 
 
@@ -24,17 +25,22 @@ class LazyImport:
         self._module = None
         self._import_error_message = import_error_message
         self._set_loggers = set_loggers
+        self._lock = threading.RLock()
 
     def load_module(self):
-        if self._module is None:
-            try:
-                self._module = importlib.import_module(self._module_name)
-                if self._set_loggers is not None:
-                    self._set_loggers()
-            except ImportError as e:
-                if self._import_error_message is not None:
-                    raise ImportError(self._import_error_message) from e
-                raise
+        # Avoid extra imports when multiple threads try to import the same
+        # module. The overhead is minor since import can only run in serial
+        # due to GIL even in multi-threaded environments.
+        with self._lock:
+            if self._module is None:
+                try:
+                    self._module = importlib.import_module(self._module_name)
+                    if self._set_loggers is not None:
+                        self._set_loggers()
+                except ImportError as e:
+                    if self._import_error_message is not None:
+                        raise ImportError(self._import_error_message) from e
+                    raise
         return self._module
 
     def __getattr__(self, name: str) -> Any:
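The change above is double-checked lazy import under a re-entrant lock. A minimal standalone sketch of the pattern (illustrative names, not SkyPilot's actual class):

import importlib
import threading

class _LazyModule:
    """Illustrative stand-in for sky.adaptors.common.LazyImport."""

    def __init__(self, name: str):
        self._name = name
        self._module = None
        self._lock = threading.RLock()

    def load(self):
        # Check under the lock so two threads racing on the first access do
        # not both call importlib.import_module for the same module.
        with self._lock:
            if self._module is None:
                self._module = importlib.import_module(self._name)
        return self._module

lazy_json = _LazyModule('json')  # 'json' stands in for a heavy optional dependency
assert lazy_json.load() is lazy_json.load()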
sky/adaptors/do.py ADDED
@@ -0,0 +1,20 @@
+"""Digital Ocean cloud adaptors"""
+
+# pylint: disable=import-outside-toplevel
+
+from sky.adaptors import common
+
+_IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for DO. '
+                         'Try pip install "skypilot[do]"')
+pydo = common.LazyImport('pydo', import_error_message=_IMPORT_ERROR_MESSAGE)
+azure = common.LazyImport('azure', import_error_message=_IMPORT_ERROR_MESSAGE)
+_LAZY_MODULES = (pydo, azure)
+
+
+# `pydo`` inherits Azure exceptions. See:
+# https://github.com/digitalocean/pydo/blob/7b01498d99eb0d3a772366b642e5fab3d6fc6aa2/examples/poc_droplets_volumes_sshkeys.py#L6
+@common.load_lazy_modules(modules=_LAZY_MODULES)
+def exceptions():
+    """Azure exceptions."""
+    from azure.core import exceptions as azure_exceptions
+    return azure_exceptions
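A hedged usage sketch of the new adaptor: the pydo import is deferred until first attribute access, so `import sky` stays fast when the DO extra is not installed (the `pydo.Client(token=...)` constructor is assumed from pydo's documentation, not from this diff):

from sky.adaptors import do

def make_do_client(token: str):
    # Importing pydo happens here, on first use; if the `do` extra is
    # missing this raises the adaptor's ImportError asking for
    # `pip install "skypilot[do]"`.
    return do.pydo.Client(token=token)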
sky/adaptors/oci.py CHANGED
@@ -1,9 +1,11 @@
 """Oracle OCI cloud adaptor"""
 
+import functools
 import logging
 import os
 
 from sky.adaptors import common
+from sky.clouds.utils import oci_utils
 
 # Suppress OCI circuit breaker logging before lazy import, because
 # oci modules prints additional message during imports, i.e., the
@@ -30,10 +32,16 @@ def get_config_file() -> str:
 
 def get_oci_config(region=None, profile='DEFAULT'):
     conf_file_path = get_config_file()
+    if not profile or profile == 'DEFAULT':
+        config_profile = oci_utils.oci_config.get_profile()
+    else:
+        config_profile = profile
+
     oci_config = oci.config.from_file(file_location=conf_file_path,
-                                      profile_name=profile)
+                                      profile_name=config_profile)
     if region is not None:
         oci_config['region'] = region
+
     return oci_config
 
 
@@ -54,6 +62,29 @@ def get_identity_client(region=None, profile='DEFAULT'):
     return oci.identity.IdentityClient(get_oci_config(region, profile))
 
 
+def get_object_storage_client(region=None, profile='DEFAULT'):
+    return oci.object_storage.ObjectStorageClient(
+        get_oci_config(region, profile))
+
+
 def service_exception():
     """OCI service exception."""
     return oci.exceptions.ServiceError
+
+
+def with_oci_env(f):
+
+    @functools.wraps(f)
+    def wrapper(*args, **kwargs):
+        # pylint: disable=line-too-long
+        enter_env_cmds = [
+            'conda info --envs | grep "sky-oci-cli-env" || conda create -n sky-oci-cli-env python=3.10 -y',
+            '. $(conda info --base 2> /dev/null)/etc/profile.d/conda.sh > /dev/null 2>&1 || true',
+            'conda activate sky-oci-cli-env', 'pip install oci-cli',
+            'export OCI_CLI_SUPPRESS_FILE_PERMISSIONS_WARNING=True'
+        ]
+        operation_cmd = [f(*args, **kwargs)]
+        leave_env_cmds = ['conda deactivate']
+        return ' && '.join(enter_env_cmds + operation_cmd + leave_env_cmds)
+
+    return wrapper
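Illustrative use of the new `with_oci_env` decorator: the wrapped function returns a single OCI CLI command string, and the decorator joins it with commands that create/activate a `sky-oci-cli-env` conda environment, install `oci-cli`, and deactivate afterwards. `bulk_download_cmd` below is a hypothetical helper, not part of the diff:

from sky.adaptors import oci

@oci.with_oci_env
def bulk_download_cmd(bucket: str, dest_dir: str) -> str:
    # Hypothetical helper: returns the bare OCI CLI invocation.
    return f'oci os object bulk-download -bn {bucket} --download-dir {dest_dir}'

# One '&&'-joined shell string: conda env setup, pip install oci-cli,
# the bulk-download command, then conda deactivate.
print(bulk_download_cmd('my-bucket', '/tmp/data'))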
sky/authentication.py CHANGED
@@ -408,14 +408,26 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
     secret = k8s.client.V1Secret(
         metadata=k8s.client.V1ObjectMeta(**secret_metadata),
         string_data={secret_field_name: public_key})
-    if kubernetes_utils.check_secret_exists(secret_name, namespace, context):
-        logger.debug(f'Key {secret_name} exists in the cluster, patching it...')
-        kubernetes.core_api(context).patch_namespaced_secret(
-            secret_name, namespace, secret)
-    else:
-        logger.debug(
-            f'Key {secret_name} does not exist in the cluster, creating it...')
-        kubernetes.core_api(context).create_namespaced_secret(namespace, secret)
+    try:
+        if kubernetes_utils.check_secret_exists(secret_name, namespace,
+                                                context):
+            logger.debug(f'Key {secret_name} exists in the cluster, '
+                         'patching it...')
+            kubernetes.core_api(context).patch_namespaced_secret(
+                secret_name, namespace, secret)
+        else:
+            logger.debug(f'Key {secret_name} does not exist in the cluster, '
+                         'creating it...')
+            kubernetes.core_api(context).create_namespaced_secret(
+                namespace, secret)
+    except kubernetes.api_exception() as e:
+        if e.status == 409 and e.reason == 'AlreadyExists':
+            logger.debug(f'Key {secret_name} was created concurrently, '
+                         'patching it...')
+            kubernetes.core_api(context).patch_namespaced_secret(
+                secret_name, namespace, secret)
+        else:
+            raise e
 
     private_key_path, _ = get_or_generate_keys()
     if network_mode == nodeport_mode:
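The change above follows the usual create-or-patch upsert pattern for Kubernetes secrets, falling back to a patch when a concurrent writer wins the race. A generic sketch with the official kubernetes client (placeholders, not SkyPilot's wrappers):

from kubernetes import client, config

def upsert_secret(namespace: str, secret: client.V1Secret) -> None:
    config.load_kube_config()
    api = client.CoreV1Api()
    try:
        api.create_namespaced_secret(namespace, secret)
    except client.exceptions.ApiException as e:
        if e.status == 409:
            # Another process created the secret concurrently; patch instead.
            api.patch_namespaced_secret(secret.metadata.name, namespace, secret)
        else:
            raise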
sky/backends/backend_utils.py CHANGED
@@ -650,6 +650,42 @@ def _replace_yaml_dicts(
     return common_utils.dump_yaml_str(new_config)
 
 
+def get_expirable_clouds(
+        enabled_clouds: Sequence[clouds.Cloud]) -> List[clouds.Cloud]:
+    """Returns a list of clouds that use local credentials and whose credentials can expire.
+
+    This function checks each cloud in the provided sequence to determine if it uses local credentials
+    and if its credentials can expire. If both conditions are met, the cloud is added to the list of
+    expirable clouds.
+
+    Args:
+        enabled_clouds (Sequence[clouds.Cloud]): A sequence of cloud objects to check.
+
+    Returns:
+        list[clouds.Cloud]: A list of cloud objects that use local credentials and whose credentials can expire.
+    """
+    expirable_clouds = []
+    local_credentials_value = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value
+    for cloud in enabled_clouds:
+        remote_identities = skypilot_config.get_nested(
+            (str(cloud).lower(), 'remote_identity'), None)
+        if remote_identities is None:
+            remote_identities = schemas.get_default_remote_identity(
+                str(cloud).lower())
+
+        local_credential_expiring = cloud.can_credential_expire()
+        if isinstance(remote_identities, str):
+            if remote_identities == local_credentials_value and local_credential_expiring:
+                expirable_clouds.append(cloud)
+        elif isinstance(remote_identities, list):
+            for profile in remote_identities:
+                if list(profile.values(
+                ))[0] == local_credentials_value and local_credential_expiring:
+                    expirable_clouds.append(cloud)
+                    break
+    return expirable_clouds
+
+
 # TODO: too many things happening here - leaky abstraction. Refactor.
 @timeline.event
 def write_cluster_config(
@@ -926,6 +962,13 @@ def write_cluster_config(
             tmp_yaml_path,
             cluster_config_overrides=to_provision.cluster_config_overrides)
         kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
+        yaml_obj = common_utils.read_yaml(tmp_yaml_path)
+        pod_config = yaml_obj['available_node_types']['ray_head_default'][
+            'node_config']
+        valid, message = kubernetes_utils.check_pod_config(pod_config)
+        if not valid:
+            raise exceptions.InvalidCloudConfigs(
+                f'Invalid pod_config. Details: {message}')
 
     if dryrun:
         # If dryrun, return the unfinished tmp yaml path.
@@ -1000,6 +1043,7 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
             clouds.Cudo,
             clouds.Paperspace,
             clouds.Azure,
+            clouds.DO,
     )):
         config = auth.configure_ssh_info(config)
     elif isinstance(cloud, clouds.GCP):
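A hedged sketch of how a caller can use the new helper to warn before launching a controller; the provisioner change later in this diff does essentially this:

from sky import check as sky_check
from sky.backends import backend_utils

enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh()
expirable = backend_utils.get_expirable_clouds(enabled_clouds)
if expirable:
    print(f'Warning: credentials for {expirable} may expire; a long-running '
          'controller could leak clusters if they expire mid-job.')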
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -10,6 +10,7 @@ import os
 import pathlib
 import re
 import shlex
+import shutil
 import signal
 import subprocess
 import sys
@@ -26,6 +27,7 @@ import filelock
 
 import sky
 from sky import backends
+from sky import check as sky_check
 from sky import cloud_stores
 from sky import clouds
 from sky import exceptions
@@ -34,7 +36,6 @@ from sky import jobs as managed_jobs
 from sky import optimizer
 from sky import provision as provision_lib
 from sky import resources as resources_lib
-from sky import serve as serve_lib
 from sky import sky_logging
 from sky import status_lib
 from sky import task as task_lib
@@ -44,6 +45,7 @@ from sky.clouds import service_catalog
 from sky.clouds.utils import gcp_utils
 from sky.data import data_utils
 from sky.data import storage as storage_lib
+from sky.jobs import constants as managed_jobs_constants
 from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision import metadata_utils
@@ -154,6 +156,9 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
 # might be added during ssh.
 _MAX_INLINE_SCRIPT_LENGTH = 120 * 1024
 
+_RESOURCES_UNAVAILABLE_LOG = (
+    'Reasons for provision failures (for details, please check the log above):')
+
 
 def _is_command_length_over_limit(command: str) -> bool:
     """Check if the length of the command exceeds the limit.
@@ -178,6 +183,7 @@ def _get_cluster_config_template(cloud):
         clouds.SCP: 'scp-ray.yml.j2',
         clouds.OCI: 'oci-ray.yml.j2',
         clouds.Paperspace: 'paperspace-ray.yml.j2',
+        clouds.DO: 'do-ray.yml.j2',
         clouds.RunPod: 'runpod-ray.yml.j2',
         clouds.Kubernetes: 'kubernetes-ray.yml.j2',
         clouds.Vsphere: 'vsphere-ray.yml.j2',
@@ -1995,6 +2001,23 @@ class RetryingVmProvisioner(object):
                           skip_unnecessary_provisioning else None)
 
         failover_history: List[Exception] = list()
+        resource_exceptions: Dict[resources_lib.Resources, Exception] = dict()
+        # If the user is using local credentials which may expire, the
+        # controller may leak resources if the credentials expire while a job
+        # is running. Here we check the enabled clouds and expiring credentials
+        # and raise a warning to the user.
+        if task.is_controller_task():
+            enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh()
+            expirable_clouds = backend_utils.get_expirable_clouds(
+                enabled_clouds)
+
+            if len(expirable_clouds) > 0:
+                warnings = (f'\033[93mWarning: Credentials used for '
+                            f'{expirable_clouds} may expire. Clusters may be '
+                            f'leaked if the credentials expire while jobs '
+                            f'are running. It is recommended to use credentials'
+                            f' that never expire or a service account.\033[0m')
+                logger.warning(warnings)
 
         # Retrying launchable resources.
         while True:
@@ -2070,6 +2093,8 @@
                 # Add failed resources to the blocklist, only when it
                 # is in fallback mode.
                 _add_to_blocked_resources(self._blocked_resources, to_provision)
+                assert len(failover_history) > 0
+                resource_exceptions[to_provision] = failover_history[-1]
             else:
                 # If we reach here, it means that the existing cluster must have
                 # a previous status of INIT, because other statuses (UP,
@@ -2114,7 +2139,14 @@
                 # possible resources or the requested resources is too
                 # restrictive. If we reach here, our failover logic finally
                 # ends here.
-                raise e.with_failover_history(failover_history)
+                table = log_utils.create_table(['Resource', 'Reason'])
+                for (resource, exception) in resource_exceptions.items():
+                    table.add_row(
+                        [resources_utils.format_resource(resource), exception])
+                table.max_table_width = shutil.get_terminal_size().columns
+                raise exceptions.ResourcesUnavailableError(
+                    _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
+                    failover_history=failover_history)
             to_provision = task.best_resources
             assert task in self._dag.tasks, 'Internal logic error.'
             assert to_provision is not None, task
@@ -2877,7 +2909,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     'the `--retry-until-up` flag.')
                 with ux_utils.print_exception_no_traceback():
                     raise exceptions.ResourcesUnavailableError(
-                        error_message,
+                        error_message + '\n' + str(e),
                         failover_history=e.failover_history) from None
             if dryrun:
                 record = global_user_state.get_cluster_from_name(cluster_name)
@@ -3309,7 +3341,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # even if some of them raise exceptions. We should replace it with
         # multi-process.
         rich_utils.stop_safe_status()
-        subprocess_utils.run_in_parallel(_setup_node, range(num_nodes))
+        subprocess_utils.run_in_parallel(_setup_node, list(range(num_nodes)))
 
         if detach_setup:
             # Only set this when setup needs to be run outside the self._setup()
@@ -3873,42 +3905,157 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             stdin=subprocess.DEVNULL,
         )
 
-    def tail_serve_logs(self, handle: CloudVmRayResourceHandle,
-                        service_name: str, target: serve_lib.ServiceComponent,
-                        replica_id: Optional[int], follow: bool) -> None:
-        """Tail the logs of a service.
+    def sync_down_managed_job_logs(
+            self,
+            handle: CloudVmRayResourceHandle,
+            job_id: Optional[int] = None,
+            job_name: Optional[str] = None,
+            controller: bool = False,
+            local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[str, str]:
+        """Sync down logs for a managed job.
 
         Args:
-            handle: The handle to the sky serve controller.
-            service_name: The name of the service.
-            target: The component to tail the logs of. Could be controller,
-                load balancer, or replica.
-            replica_id: The replica ID to tail the logs of. Only used when
-                target is replica.
-            follow: Whether to follow the logs.
-        """
-        if target != serve_lib.ServiceComponent.REPLICA:
-            code = serve_lib.ServeCodeGen.stream_serve_process_logs(
-                service_name,
-                stream_controller=(
-                    target == serve_lib.ServiceComponent.CONTROLLER),
-                follow=follow)
-        else:
-            assert replica_id is not None, service_name
-            code = serve_lib.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow)
+            handle: The handle to the cluster.
+            job_id: The job ID to sync down logs for.
+            job_name: The job name to sync down logs for.
+            controller: Whether to sync down logs for the controller.
+            local_dir: The local directory to sync down logs to.
 
-        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
-        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
+        Returns:
+            A dictionary mapping job_id to log path.
+        """
+        # if job_name and job_id should not both be specified
+        assert job_name is None or job_id is None, (job_name, job_id)
 
-        self.run_on_head(
+        if job_id is None:
+            # generate code to get the job_id
+            # if job_name is None, get all job_ids
+            # TODO: Only get the latest job_id, since that's the only one we use
+            code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                job_name=job_name)
+            returncode, job_ids, stderr = self.run_on_head(handle,
+                                                           code,
+                                                           stream_logs=False,
+                                                           require_outputs=True,
+                                                           separate_stderr=True)
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to sync down logs.',
+                                               stderr)
+            job_ids = common_utils.decode_payload(job_ids)
+            if not job_ids:
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            'No matching job found'
+                            f'{colorama.Style.RESET_ALL}')
+                return {}
+            elif len(job_ids) > 1:
+                name_str = ''
+                if job_name is not None:
+                    name_str = ('Multiple jobs IDs found under the name '
+                                f'{job_name}. ')
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            f'{name_str}'
+                            'Downloading the latest job logs.'
+                            f'{colorama.Style.RESET_ALL}')
+            # list should aready be in descending order
+            job_id = job_ids[0]
+
+        # get the run_timestamp
+        # the function takes in [job_id]
+        code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
+            [str(job_id)])
+        returncode, run_timestamps, stderr = self.run_on_head(
             handle,
             code,
-            stream_logs=True,
-            process_stream=False,
-            ssh_mode=command_runner.SshMode.INTERACTIVE,
-            stdin=subprocess.DEVNULL,
-        )
+            stream_logs=False,
+            require_outputs=True,
+            separate_stderr=True)
+        subprocess_utils.handle_returncode(returncode, code,
+                                           'Failed to sync logs.', stderr)
+        # returns with a dict of {job_id: run_timestamp}
+        run_timestamps = common_utils.decode_payload(run_timestamps)
+        if not run_timestamps:
+            logger.info(f'{colorama.Fore.YELLOW}'
+                        'No matching log directories found'
+                        f'{colorama.Style.RESET_ALL}')
+            return {}
+
+        run_timestamp = list(run_timestamps.values())[0]
+        job_id = list(run_timestamps.keys())[0]
+        local_log_dir = ''
+        if controller:  # download controller logs
+            remote_log = os.path.join(
+                managed_jobs_constants.JOBS_CONTROLLER_LOGS_DIR,
+                f'{job_id}.log')
+            local_log_dir = os.path.expanduser(
+                os.path.join(local_dir, run_timestamp))
+
+            logger.info(f'{colorama.Fore.CYAN}'
+                        f'Job {job_id} local logs: {local_log_dir}'
+                        f'{colorama.Style.RESET_ALL}')
+
+            runners = handle.get_command_runners()
+
+            def _rsync_down(args) -> None:
+                """Rsync down logs from remote nodes.
+
+                Args:
+                    args: A tuple of (runner, local_log_dir, remote_log_dir)
+                """
+                (runner, local_log_dir, remote_log) = args
+                try:
+                    os.makedirs(local_log_dir, exist_ok=True)
+                    runner.rsync(
+                        source=remote_log,
+                        target=f'{local_log_dir}/controller.log',
+                        up=False,
+                        stream_logs=False,
+                    )
+                except exceptions.CommandError as e:
+                    if e.returncode == exceptions.RSYNC_FILE_NOT_FOUND_CODE:
+                        # Raised by rsync_down. Remote log dir may not exist
+                        # since the job can be run on some part of the nodes.
+                        logger.debug(
+                            f'{runner.node_id} does not have the tasks/*.')
+                    else:
+                        raise
+
+            parallel_args = [
+                (runner, local_log_dir, remote_log) for runner in runners
+            ]
+            subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
+        else:  # download job logs
+            local_log_dir = os.path.expanduser(
+                os.path.join(local_dir, 'managed_jobs', run_timestamp))
+            os.makedirs(os.path.dirname(local_log_dir), exist_ok=True)
+            log_file = os.path.join(local_log_dir, 'run.log')
+
+            code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
+                                                              job_id=job_id,
+                                                              follow=False,
+                                                              controller=False)
+
+            # With the stdin=subprocess.DEVNULL, the ctrl-c will not
+            # kill the process, so we need to handle it manually here.
+            if threading.current_thread() is threading.main_thread():
+                signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
+                signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
+
+            # We redirect the output to the log file
+            # and disable the STDOUT and STDERR
+            self.run_on_head(
+                handle,
+                code,
+                log_path=log_file,
+                stream_logs=False,
+                process_stream=False,
+                ssh_mode=command_runner.SshMode.INTERACTIVE,
+                stdin=subprocess.DEVNULL,
+            )
+
+        logger.info(f'{colorama.Fore.CYAN}'
+                    f'Job {job_id} logs: {local_log_dir}'
+                    f'{colorama.Style.RESET_ALL}')
+        return {str(job_id): local_log_dir}
 
     def teardown_no_lock(self,
                          handle: CloudVmRayResourceHandle,
@@ -4198,11 +4345,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         attempts = 0
         while True:
             logger.debug(f'instance statuses attempt {attempts + 1}')
-            node_status_dict = provision_lib.query_instances(
-                repr(cloud),
-                cluster_name_on_cloud,
-                config['provider'],
-                non_terminated_only=False)
+            try:
+                node_status_dict = provision_lib.query_instances(
+                    repr(cloud),
+                    cluster_name_on_cloud,
+                    config['provider'],
+                    non_terminated_only=False)
+            except Exception as e:  # pylint: disable=broad-except
+                if purge:
+                    logger.warning(
+                        f'Failed to query instances. Skipping since purge is '
+                        f'set. Details: '
+                        f'{common_utils.format_exception(e, use_bracket=True)}')
+                    break
+                raise
 
             unexpected_node_state: Optional[Tuple[str, str]] = None
             for node_id, node_status in node_status_dict.items():
@@ -4221,8 +4377,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
             else:
                 (node_id, node_status) = unexpected_node_state
-                raise RuntimeError(f'Instance {node_id} in unexpected state '
-                                   f'{node_status}.')
+                if purge:
+                    logger.warning(f'Instance {node_id} in unexpected '
+                                   f'state {node_status}. Skipping since purge '
+                                   'is set.')
+                    break
+                raise RuntimeError(f'Instance {node_id} in unexpected '
+                                   f'state {node_status}.')
 
         global_user_state.remove_cluster(handle.cluster_name,
                                          terminate=terminate)
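The RetryingVmProvisioner hunk above now ends failover with a per-resource failure summary table. A rough standalone sketch of that formatting, using prettytable directly (SkyPilot builds its table via sky.utils.log_utils.create_table, which is assumed here to wrap prettytable; the row values are made-up examples):

import shutil
import prettytable

table = prettytable.PrettyTable(['Resource', 'Reason'])
table.add_row(['AWS(m6i.large, region=us-east-1)',
               'ResourcesUnavailableError: capacity not available'])  # example row
table.max_table_width = shutil.get_terminal_size().columns
print('Reasons for provision failures (for details, please check the log above):')
print(table.get_string())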
sky/backends/wheel_utils.py CHANGED
@@ -153,7 +153,10 @@ def build_sky_wheel() -> Tuple[pathlib.Path, str]:
         if not path.exists():
            return -1.
         try:
-            return max(os.path.getmtime(root) for root, _, _ in os.walk(path))
+            return max(
+                os.path.getmtime(os.path.join(root, f))
+                for root, dirs, files in os.walk(path)
+                for f in (*dirs, *files))
         except ValueError:
             return -1.
 
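The wheel_utils fix above changes the freshness check from directory mtimes alone to the newest mtime of any entry under the tree. A standalone illustration of the same computation:

import os

def latest_mtime(path: str) -> float:
    """Newest modification time of any file or subdirectory under `path`."""
    try:
        return max(
            os.path.getmtime(os.path.join(root, name))
            for root, dirs, files in os.walk(path)
            for name in (*dirs, *files))
    except ValueError:
        # Raised by max() on an empty tree.
        return -1.0

print(latest_mtime('.'))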
sky/check.py CHANGED
@@ -155,7 +155,8 @@ def check(
     # Pretty print for UX.
     if not quiet:
         enabled_clouds_str = '\n :heavy_check_mark: '.join(
-            [''] + sorted(all_enabled_clouds))
+            [''] +
+            [_format_enabled_cloud(c) for c in sorted(all_enabled_clouds)])
         rich.print('\n[green]:tada: Enabled clouds :tada:'
                    f'{enabled_clouds_str}[/green]')
 
@@ -222,3 +223,32 @@ def get_cloud_credential_file_mounts(
     r2_credential_mounts = cloudflare.get_credential_file_mounts()
     file_mounts.update(r2_credential_mounts)
     return file_mounts
+
+
+def _format_enabled_cloud(cloud_name: str) -> str:
+    if cloud_name == repr(sky_clouds.Kubernetes()):
+        # Get enabled contexts for Kubernetes
+        existing_contexts = sky_clouds.Kubernetes.existing_allowed_contexts()
+        if not existing_contexts:
+            return cloud_name
+
+        # Check if allowed_contexts is explicitly set in config
+        allowed_contexts = skypilot_config.get_nested(
+            ('kubernetes', 'allowed_contexts'), None)
+
+        # Format the context info with consistent styling
+        if allowed_contexts is not None:
+            contexts_formatted = []
+            for i, context in enumerate(existing_contexts):
+                # TODO: We should use ux_utils.INDENT_SYMBOL and
+                # INDENT_LAST_SYMBOL but, they are formatted for colorama, while
+                # here we are using rich. We should migrate this file to
+                # use colorama as we do in the rest of the codebase.
+                symbol = ('└── ' if i == len(existing_contexts) - 1 else '├── ')
+                contexts_formatted.append(f'\n {symbol}{context}')
+            context_info = f'Allowed contexts:{"".join(contexts_formatted)}'
+        else:
+            context_info = f'Active context: {existing_contexts[0]}'
+
+        return f'{cloud_name}[/green][dim]\n └── {context_info}[/dim][green]'
+    return cloud_name
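To preview how the new Kubernetes entry renders, the rich markup produced by _format_enabled_cloud looks roughly like the following (context names are made-up examples, and the exact spacing is approximate):

import rich

rich.print('[green]:heavy_check_mark: Kubernetes[/green][dim]'
           '\n └── Allowed contexts:'
           '\n     ├── gke_my-project_us-central1_cluster-a'
           '\n     └── kind-local[/dim]')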