skypilot-nightly 1.0.0.dev20241202__py3-none-any.whl → 1.0.0.dev20241204__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- sky/__init__.py +4 -2
- sky/backends/backend_utils.py +5 -4
- sky/backends/cloud_vm_ray_backend.py +27 -7
- sky/cli.py +11 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +3 -4
- sky/core.py +25 -18
- sky/exceptions.py +7 -0
- sky/execution.py +3 -2
- sky/jobs/controller.py +28 -8
- sky/jobs/core.py +61 -35
- sky/jobs/recovery_strategy.py +2 -1
- sky/jobs/state.py +33 -1
- sky/jobs/utils.py +16 -2
- sky/setup_files/dependencies.py +141 -0
- sky/setup_files/setup.py +12 -124
- sky/skylet/constants.py +36 -11
- sky/skylet/log_lib.py +3 -1
- sky/skylet/log_lib.pyi +3 -0
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/utils/controller_utils.py +60 -98
- {skypilot_nightly-1.0.0.dev20241202.dist-info → skypilot_nightly-1.0.0.dev20241204.dist-info}/METADATA +3 -2
- {skypilot_nightly-1.0.0.dev20241202.dist-info → skypilot_nightly-1.0.0.dev20241204.dist-info}/RECORD +26 -25
- {skypilot_nightly-1.0.0.dev20241202.dist-info → skypilot_nightly-1.0.0.dev20241204.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241202.dist-info → skypilot_nightly-1.0.0.dev20241204.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241202.dist-info → skypilot_nightly-1.0.0.dev20241204.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241202.dist-info → skypilot_nightly-1.0.0.dev20241204.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '51a7e177d99fdfe73a89c04dddc385940a97a37d'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20241202'
+__version__ = '1.0.0.dev20241204'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
@@ -105,6 +105,7 @@ from sky.data import StorageMode
 from sky.data import StoreType
 from sky.execution import exec  # pylint: disable=redefined-builtin
 from sky.execution import launch
+from sky.jobs import ManagedJobStatus
 # TODO (zhwu): These imports are for backward compatibility, and spot APIs
 # should be called with `sky.spot.xxx` instead. Remove in release 0.8.0
 from sky.jobs.core import spot_cancel
@@ -163,6 +164,7 @@ __all__ = [
     'StoreType',
     'ClusterStatus',
     'JobStatus',
+    'ManagedJobStatus',
     # APIs
    'Dag',
     'Task',
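
The two additions above re-export the managed-job status enum at the top level of the package. A minimal sketch of what this enables (assuming, as elsewhere in this codebase, that `sky.jobs.queue()` returns one dict per job with `status` and `job_id` fields):

    import sky

    # With the new re-export, callers can compare statuses without
    # importing from sky.jobs.state directly.
    for job in sky.jobs.queue(refresh=False):
        if job['status'] == sky.ManagedJobStatus.RUNNING:
            print(f"Managed job {job['job_id']} is still running.")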
sky/backends/backend_utils.py
CHANGED
@@ -1612,14 +1612,14 @@ def check_can_clone_disk_and_override_task(
         The task to use and the resource handle of the source cluster.
 
     Raises:
-
+        exceptions.ClusterDoesNotExist: If the source cluster does not exist.
         exceptions.NotSupportedError: If the source cluster is not valid or the
             task is not compatible to clone disk from the source cluster.
     """
     source_cluster_status, handle = refresh_cluster_status_handle(cluster_name)
     if source_cluster_status is None:
         with ux_utils.print_exception_no_traceback():
-            raise
+            raise exceptions.ClusterDoesNotExist(
                 f'Cannot find cluster {cluster_name!r} to clone disk from.')
 
     if not isinstance(handle, backends.CloudVmRayResourceHandle):
@@ -2136,7 +2136,7 @@ def check_cluster_available(
     """Check if the cluster is available.
 
     Raises:
-
+        exceptions.ClusterDoesNotExist: if the cluster does not exist.
         exceptions.ClusterNotUpError: if the cluster is not UP.
         exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -2201,7 +2201,8 @@ def check_cluster_available(
         error_msg += message
 
     with ux_utils.print_exception_no_traceback():
-        raise
+        raise exceptions.ClusterDoesNotExist(
+            f'{colorama.Fore.YELLOW}{error_msg}{reset}')
     assert cluster_status is not None, 'handle is not None but status is None'
     backend = get_backend_from_handle(handle)
     if check_cloud_vm_ray_backend and not isinstance(
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -301,6 +301,8 @@ class RayCodeGen:
         )
         def get_or_fail(futures, pg) -> List[int]:
             \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
+            if not futures:
+                return []
             returncodes = [1] * len(futures)
             # Wait for 1 task to be ready.
             ready = []
@@ -3460,15 +3462,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             Job id if the task is submitted to the cluster, None otherwise.
         """
-        if task.run is None:
+        if task.run is None and self._setup_cmd is None:
+            # This message is fine without mentioning setup, as there are three
+            # cases when run section is empty:
+            # 1. setup specified, no --detach-setup: setup is executed and this
+            #    message is fine for saying no run command specified.
+            # 2. setup specified, with --detach-setup: setup is executed in
+            #    detached mode and this message will not be shown.
+            # 3. no setup specified: this message is fine as a user is likely
+            #    creating a cluster only, and ok with the empty run command.
             logger.info('Run commands not specified or empty.')
             return None
-
-
-
-
-
-
+        if task.run is None:
+            # If the task has no run command, we still need to execute the
+            # generated ray driver program to run the setup command in detached
+            # mode.
+            # In this case, we reset the resources for the task, so that the
+            # detached setup does not need to wait for the task resources to be
+            # ready (which is not used for setup anyway).
+            valid_resource = sky.Resources()
+        else:
+            # Check the task resources vs the cluster resources. Since
+            # `sky exec` will not run the provision and _check_existing_cluster
+            # We need to check ports here since sky.exec shouldn't change
+            # resources.
+            valid_resource = self.check_resources_fit_cluster(handle,
+                                                              task,
+                                                              check_ports=True)
         task_copy = copy.copy(task)
         # Handle multiple resources exec case.
         task_copy.set_resources(valid_resource)
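
The first hunk guards the generated driver's `get_or_fail` against an empty futures list. For context, a self-contained sketch of the wait-and-cancel pattern that function implements, written against plain Ray (this is an illustration, not the generated driver code itself, which also uses a placement group):

    import ray
    from typing import List

    @ray.remote
    def run_task(i: int) -> int:
        # Stand-in for a task; returns a shell-style return code.
        return 0

    def get_or_fail(futures: List) -> List[int]:
        """Wait for tasks; if any fails, cancel all unready ones."""
        if not futures:  # the new guard: nothing to wait on
            return []
        returncodes = [1] * len(futures)
        pending = list(futures)
        while pending:
            # Wait for one task at a time so a failure is seen early.
            [ready], pending = ray.wait(pending, num_returns=1)
            idx = futures.index(ready)
            returncodes[idx] = ray.get(ready)
            if returncodes[idx] != 0:
                for f in pending:
                    ray.cancel(f)  # best-effort cancel of unready tasks
                break
        return returncodes

    ray.init()
    print(get_or_fail([run_task.remote(i) for i in range(4)]))  # [0, 0, 0, 0]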
sky/cli.py
CHANGED
@@ -3914,16 +3914,25 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
     default=False,
     help=('Show the controller logs of this job; useful for debugging '
           'launching/recoveries, etc.'))
+@click.option(
+    '--refresh',
+    '-r',
+    default=False,
+    is_flag=True,
+    required=False,
+    help='Query the latest job logs, restarting the jobs controller if stopped.'
+)
 @click.argument('job_id', required=False, type=int)
 @usage_lib.entrypoint
 def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
-              controller: bool):
+              controller: bool, refresh: bool):
     """Tail the log of a managed job."""
     try:
         managed_jobs.tail_logs(name=name,
                                job_id=job_id,
                                follow=follow,
-                               controller=controller)
+                               controller=controller,
+                               refresh=refresh)
     except exceptions.ClusterNotUpError:
         with ux_utils.print_exception_no_traceback():
             raise
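
With the new flag wired through, `sky jobs logs --refresh <job_id>` restarts a stopped jobs controller before tailing. The SDK-side equivalent might look like the following sketch (it assumes `sky.jobs` re-exports `tail_logs`, mirroring how the CLI calls it):

    from sky import jobs as managed_jobs

    # refresh=True restarts the jobs controller if it has autostopped,
    # instead of raising sky.exceptions.ClusterNotUpError.
    managed_jobs.tail_logs(name=None, job_id=1, follow=True,
                           controller=False, refresh=True)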
sky/clouds/service_catalog/kubernetes_catalog.py
CHANGED
@@ -239,13 +239,12 @@ def _list_accelerators(
 
         accelerators_available = accelerator_count - allocated_qty
 
-        if accelerator_name not in total_accelerators_available:
-            total_accelerators_available[accelerator_name] = 0
         if accelerators_available >= min_quantity_filter:
             quantized_availability = min_quantity_filter * (
                 accelerators_available // min_quantity_filter)
-            total_accelerators_available[
-                accelerator_name
+            total_accelerators_available[accelerator_name] = (
+                total_accelerators_available.get(accelerator_name, 0) +
+                quantized_availability)
 
     result = []
 
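
The rewrite changes the bookkeeping from overwrite to accumulate: each node's quantized availability is now summed into the running total via `dict.get`. A toy illustration of the accumulation (the helper and values here are hypothetical):

    from typing import Dict

    def accumulate(totals: Dict[str, int], name: str, available: int,
                   min_qty: int) -> None:
        # Each node contributes floor(available / min_qty) * min_qty units.
        if available >= min_qty:
            quantized = min_qty * (available // min_qty)
            totals[name] = totals.get(name, 0) + quantized

    totals: Dict[str, int] = {}
    for node_available in (3, 5):  # two nodes with free GPUs of one type
        accumulate(totals, 'H100', node_available, min_qty=2)
    print(totals)  # {'H100': 6}, i.e. 2 + 4, not just the last node's count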
sky/core.py
CHANGED
@@ -268,7 +268,8 @@ def _start(
     cluster_status, handle = backend_utils.refresh_cluster_status_handle(
         cluster_name)
     if handle is None:
-        raise
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')
     if not force and cluster_status == status_lib.ClusterStatus.UP:
         sky_logging.print(f'Cluster {cluster_name!r} is already up.')
         return handle
@@ -359,12 +360,13 @@ def start(
             Useful for upgrading SkyPilot runtime.
 
     Raises:
-        ValueError: argument values are invalid: (1)
-
-
-
-
+        ValueError: argument values are invalid: (1) if ``down`` is set to True
+            but ``idle_minutes_to_autostop`` is None; (2) if the specified
+            cluster is the managed jobs controller, and either
+            ``idle_minutes_to_autostop`` is not None or ``down`` is True (omit
+            them to use the default autostop settings).
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         sky.exceptions.NotSupportedError: if the cluster to restart was
             launched using a non-default backend that does not support this
             operation.
@@ -412,7 +414,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
         related resources.
 
     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         RuntimeError: failed to stop the cluster.
         sky.exceptions.NotSupportedError: if the specified cluster is a spot
             cluster, or a TPU VM Pod cluster, or the managed jobs controller.
@@ -423,7 +426,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
                          f'is not supported.')
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     if handle is None:
-        raise
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')
 
     backend = backend_utils.get_backend_from_handle(handle)
 
@@ -467,14 +471,16 @@ def down(cluster_name: str, purge: bool = False) -> None:
         resources.
 
     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         RuntimeError: failed to tear down the cluster.
         sky.exceptions.NotSupportedError: the specified cluster is the managed
             jobs controller.
     """
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     if handle is None:
-        raise
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')
 
     usage_lib.record_cluster_name_for_current_operation(cluster_name)
     backend = backend_utils.get_backend_from_handle(handle)
@@ -521,7 +527,7 @@ def autostop(
             rather than autostop (restartable).
 
     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend or the cluster is TPU VM Pod.
@@ -615,7 +621,7 @@ def queue(cluster_name: str,
         }
     ]
     raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -674,7 +680,8 @@ def cancel(
         worker node is preempted in the spot cluster.
 
     Raises:
-        ValueError: if arguments are invalid
+        ValueError: if arguments are invalid.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the specified cluster is a
             controller that does not support this operation.
@@ -750,8 +757,8 @@ def tail_logs(cluster_name: str,
     Please refer to the sky.cli.tail_logs for the document.
 
     Raises:
-        ValueError: arguments are invalid or the cluster is not supported
-
+        ValueError: if arguments are invalid or the cluster is not supported.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -793,7 +800,7 @@ def download_logs(
     Returns:
         Dict[str, str]: a mapping of job_id to local log path.
     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -838,7 +845,7 @@ def job_status(cluster_name: str,
         If job_ids is None and there is no job on the cluster, it will return
         {None: None}.
     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
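
Taken together, these hunks replace the earlier failure mode with a typed error whenever a named cluster cannot be found. Caller-side handling might look like this sketch (the cluster name is made up):

    import sky
    from sky import exceptions

    try:
        sky.stop('no-such-cluster')
    except exceptions.ClusterDoesNotExist as e:
        # Raised by sky.start/stop/down and related APIs after this change.
        print(f'Nothing to stop: {e}')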
sky/exceptions.py
CHANGED
@@ -132,6 +132,13 @@ class ClusterSetUpError(Exception):
     pass
 
 
+class ClusterDoesNotExist(ValueError):
+    """Raise when trying to operate on a cluster that does not exist."""
+    # This extends ValueError for compatibility reasons - we used to throw
+    # ValueError instead of this.
+    pass
+
+
 class NotSupportedError(Exception):
     """Raised when a feature is not supported."""
     pass
sky/execution.py
CHANGED
@@ -581,8 +581,9 @@ def exec(  # pylint: disable=redefined-builtin
         submitted.
 
     Raises:
-        ValueError: if the specified cluster
-
+        ValueError: if the specified cluster is not in UP status.
+        sky.exceptions.ClusterDoesNotExist: if the specified cluster does not
+            exist.
         sky.exceptions.NotSupportedError: if the specified cluster is a
             controller that does not support this operation.
 
sky/jobs/controller.py
CHANGED
@@ -6,7 +6,7 @@ import pathlib
 import time
 import traceback
 import typing
-from typing import Tuple
+from typing import Optional, Tuple
 
 import filelock
 
@@ -87,18 +87,28 @@ class JobsController:
             task.update_envs(task_envs)
 
     def _download_log_and_stream(
-
-
-
+        self, task_id: Optional[int],
+        handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle]
+    ) -> None:
+        """Downloads and streams the logs of the current job with given task ID.
 
         We do not stream the logs from the cluster directly, as the
         donwload and stream should be faster, and more robust against
         preemptions or ssh disconnection during the streaming.
         """
+        if handle is None:
+            logger.info(f'Cluster for job {self._job_id} is not found. '
+                        'Skipping downloading and streaming the logs.')
+            return
         managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                             'managed_jobs')
-        controller_utils.download_and_stream_latest_job_log(
+        log_file = controller_utils.download_and_stream_latest_job_log(
             self._backend, handle, managed_job_logs_dir)
+        if log_file is not None:
+            # Set the path of the log file for the current task, so it can be
+            # accessed even after the job is finished
+            managed_job_state.set_local_log_file(self._job_id, task_id,
+                                                 log_file)
         logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
 
     def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
@@ -213,7 +223,8 @@ class JobsController:
             if job_status == job_lib.JobStatus.SUCCEEDED:
                 end_time = managed_job_utils.get_job_timestamp(
                     self._backend, cluster_name, get_end_time=True)
-                # The job is done.
+                # The job is done. Set the job to SUCCEEDED first before start
+                # downloading and streaming the logs to make it more responsive.
                 managed_job_state.set_succeeded(self._job_id,
                                                 task_id,
                                                 end_time=end_time,
@@ -221,12 +232,21 @@ class JobsController:
                 logger.info(
                     f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
                     f'Cleaning up the cluster {cluster_name}.')
+                clusters = backend_utils.get_clusters(
+                    cluster_names=[cluster_name],
+                    refresh=False,
+                    include_controller=False)
+                if clusters:
+                    assert len(clusters) == 1, (clusters, cluster_name)
+                    handle = clusters[0].get('handle')
+                    # Best effort to download and stream the logs.
+                    self._download_log_and_stream(task_id, handle)
                 # Only clean up the cluster, not the storages, because tasks may
                 # share storages.
                 recovery_strategy.terminate_cluster(cluster_name=cluster_name)
                 return True
 
-            # For single-node jobs,
+            # For single-node jobs, non-terminated job_status indicates a
             # healthy cluster. We can safely continue monitoring.
             # For multi-node jobs, since the job may not be set to FAILED
             # immediately (depending on user program) when only some of the
@@ -278,7 +298,7 @@ class JobsController:
                         'The user job failed. Please check the logs below.\n'
                         f'== Logs of the user job (ID: {self._job_id}) ==\n')
 
-                    self._download_log_and_stream(handle)
+                    self._download_log_and_stream(task_id, handle)
                     managed_job_status = (
                         managed_job_state.ManagedJobStatus.FAILED)
                     if job_status == job_lib.JobStatus.FAILED_SETUP:
sky/jobs/core.py
CHANGED
@@ -1,6 +1,7 @@
 """SDK functions for managed jobs."""
 import os
 import tempfile
+import typing
 from typing import Any, Dict, List, Optional, Union
 import uuid
 
@@ -29,6 +30,9 @@ from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
 
+if typing.TYPE_CHECKING:
+    from sky.backends import cloud_vm_ray_backend
+
 
 @timeline.event
 @usage_lib.entrypoint
@@ -225,6 +229,40 @@ def queue_from_kubernetes_pod(
     return jobs
 
 
+def _maybe_restart_controller(
+        refresh: bool, stopped_message: str, spinner_message: str
+) -> 'cloud_vm_ray_backend.CloudVmRayResourceHandle':
+    """Restart controller if refresh is True and it is stopped."""
+    jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
+    if refresh:
+        stopped_message = ''
+    try:
+        handle = backend_utils.is_controller_accessible(
+            controller=jobs_controller_type, stopped_message=stopped_message)
+    except exceptions.ClusterNotUpError as e:
+        if not refresh:
+            raise
+        handle = None
+        controller_status = e.cluster_status
+
+    if handle is not None:
+        return handle
+
+    sky_logging.print(f'{colorama.Fore.YELLOW}'
+                      f'Restarting {jobs_controller_type.value.name}...'
+                      f'{colorama.Style.RESET_ALL}')
+
+    rich_utils.force_update_status(
+        ux_utils.spinner_message(f'{spinner_message} - restarting '
+                                 'controller'))
+    handle = sky.start(jobs_controller_type.value.cluster_name)
+    controller_status = status_lib.ClusterStatus.UP
+    rich_utils.force_update_status(ux_utils.spinner_message(spinner_message))
+
+    assert handle is not None, (controller_status, refresh)
+    return handle
+
+
 @usage_lib.entrypoint
 def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -252,34 +290,11 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
             does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
-
-
-
-
-
-        handle = backend_utils.is_controller_accessible(
-            controller=jobs_controller_type, stopped_message=stopped_message)
-    except exceptions.ClusterNotUpError as e:
-        if not refresh:
-            raise
-        handle = None
-        controller_status = e.cluster_status
-
-    if refresh and handle is None:
-        sky_logging.print(f'{colorama.Fore.YELLOW}'
-                          'Restarting controller for latest status...'
-                          f'{colorama.Style.RESET_ALL}')
-
-        rich_utils.force_update_status(
-            ux_utils.spinner_message('Checking managed jobs - restarting '
-                                     'controller'))
-        handle = sky.start(jobs_controller_type.value.cluster_name)
-        controller_status = status_lib.ClusterStatus.UP
-        rich_utils.force_update_status(
-            ux_utils.spinner_message('Checking managed jobs'))
-
-    assert handle is not None, (controller_status, refresh)
-
+    handle = _maybe_restart_controller(refresh,
+                                       stopped_message='No in-progress '
+                                       'managed jobs.',
+                                       spinner_message='Checking '
+                                       'managed jobs')
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
 
@@ -371,7 +386,7 @@ def cancel(name: Optional[str] = None,
 
 @usage_lib.entrypoint
 def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
-              controller: bool) -> None:
+              controller: bool, refresh: bool) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Tail logs of managed jobs.
 
@@ -382,15 +397,26 @@ def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
         sky.exceptions.ClusterNotUpError: the jobs controller is not up.
     """
     # TODO(zhwu): Automatically restart the jobs controller
+    if name is not None and job_id is not None:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Cannot specify both name and job_id.')
+
     jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
-
-
+    job_name_or_id_str = ''
+    if job_id is not None:
+        job_name_or_id_str = str(job_id)
+    elif name is not None:
+        job_name_or_id_str = f'-n {name}'
+    else:
+        job_name_or_id_str = ''
+    handle = _maybe_restart_controller(
+        refresh,
         stopped_message=(
-            '
-            f'
+            f'{jobs_controller_type.value.name.capitalize()} is stopped. To '
+            f'get the logs, run: {colorama.Style.BRIGHT}sky jobs logs '
+            f'-r {job_name_or_id_str}{colorama.Style.RESET_ALL}'),
+        spinner_message='Retrieving job logs')
 
-    if name is not None and job_id is not None:
-        raise ValueError('Cannot specify both name and job_id.')
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend), backend
 
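
The refactor extracts the restart-if-stopped logic into `_maybe_restart_controller()`, now shared by `queue()` and `tail_logs()`. From the caller's side it is reached through the `refresh` argument (a sketch, assuming `sky.jobs` re-exports `queue`):

    from sky import jobs as managed_jobs

    # refresh=True routes through _maybe_restart_controller(): a stopped
    # jobs controller is restarted before the queue is fetched.
    jobs = managed_jobs.queue(refresh=True, skip_finished=True)
    print(f'{len(jobs)} unfinished managed jobs')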
sky/jobs/recovery_strategy.py
CHANGED
@@ -50,8 +50,9 @@ def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
             usage_lib.messages.usage.set_internal()
             sky.down(cluster_name)
             return
-        except
+        except exceptions.ClusterDoesNotExist:
             # The cluster is already down.
+            logger.debug(f'The cluster {cluster_name} is already down.')
             return
         except Exception as e:  # pylint: disable=broad-except
             retry_cnt += 1
sky/jobs/state.py
CHANGED
@@ -66,7 +66,8 @@ def create_table(cursor, conn):
             spot_job_id INTEGER,
             task_id INTEGER DEFAULT 0,
             task_name TEXT,
-            specs TEXT
+            specs TEXT,
+            local_log_file TEXT DEFAULT NULL)""")
     conn.commit()
 
     db_utils.add_column_to_table(cursor, conn, 'spot', 'failure_reason', 'TEXT')
@@ -103,6 +104,8 @@ def create_table(cursor, conn):
         value_to_replace_existing_entries=json.dumps({
             'max_restarts_on_errors': 0,
         }))
+    db_utils.add_column_to_table(cursor, conn, 'spot', 'local_log_file',
+                                 'TEXT DEFAULT NULL')
 
     # `job_info` contains the mapping from job_id to the job_name.
     # In the future, it may contain more information about each job.
@@ -157,6 +160,7 @@ columns = [
     'task_id',
     'task_name',
     'specs',
+    'local_log_file',
     # columns from the job_info table
     '_job_info_job_id',  # This should be the same as job_id
     'job_name',
@@ -512,6 +516,20 @@ def set_cancelled(job_id: int, callback_func: CallbackType):
         callback_func('CANCELLED')
 
 
+def set_local_log_file(job_id: int, task_id: Optional[int],
+                       local_log_file: str):
+    """Set the local log file for a job."""
+    filter_str = 'spot_job_id=(?)'
+    filter_args = [local_log_file, job_id]
+    if task_id is not None:
+        filter_str += ' AND task_id=(?)'
+        filter_args.append(task_id)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute(
+            'UPDATE spot SET local_log_file=(?) '
+            f'WHERE {filter_str}', filter_args)
+
+
 # ======== utility functions ========
 def get_nonterminal_job_ids_by_name(name: Optional[str]) -> List[int]:
     """Get non-terminal job ids by name."""
@@ -662,3 +680,17 @@ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
         WHERE spot_job_id=(?) AND task_id=(?)""",
         (job_id, task_id)).fetchone()
     return json.loads(task_specs[0])
+
+
+def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
+    """Get the local log directory for a job."""
+    filter_str = 'spot_job_id=(?)'
+    filter_args = [job_id]
+    if task_id is not None:
+        filter_str += ' AND task_id=(?)'
+        filter_args.append(task_id)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        local_log_file = cursor.execute(
+            f'SELECT local_log_file FROM spot '
+            f'WHERE {filter_str}', filter_args).fetchone()
+    return local_log_file[-1] if local_log_file else None
sky/jobs/utils.py
CHANGED
@@ -327,10 +327,24 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
         if managed_job_status.is_failed():
             job_msg = ('\nFailure reason: '
                        f'{managed_job_state.get_failure_reason(job_id)}')
+        log_file = managed_job_state.get_local_log_file(job_id, None)
+        if log_file is not None:
+            with open(log_file, 'r', encoding='utf-8') as f:
+                # Stream the logs to the console without reading the whole
+                # file into memory.
+                start_streaming = False
+                for line in f:
+                    if log_lib.LOG_FILE_START_STREAMING_AT in line:
+                        start_streaming = True
+                    if start_streaming:
+                        print(line, end='', flush=True)
+            return ''
         return (f'{colorama.Fore.YELLOW}'
                 f'Job {job_id} is already in terminal state '
-                f'{managed_job_status.value}.
-                f'{
+                f'{managed_job_status.value}. For more details, run: '
+                f'sky jobs logs --controller {job_id}'
+                f'{colorama.Style.RESET_ALL}'
+                f'{job_msg}')
     backend = backends.CloudVmRayBackend()
     task_id, managed_job_status = (
         managed_job_state.get_latest_task_id_status(job_id))