skypilot-nightly 1.0.0.dev20241202__py3-none-any.whl → 1.0.0.dev20241204__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request
 
  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = '68723df97c7c981887ba9100c510aca953f45c11'
+ _SKYPILOT_COMMIT_SHA = '51a7e177d99fdfe73a89c04dddc385940a97a37d'
 
 
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20241202'
+ __version__ = '1.0.0.dev20241204'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
@@ -105,6 +105,7 @@ from sky.data import StorageMode
  from sky.data import StoreType
  from sky.execution import exec # pylint: disable=redefined-builtin
  from sky.execution import launch
+ from sky.jobs import ManagedJobStatus
  # TODO (zhwu): These imports are for backward compatibility, and spot APIs
  # should be called with `sky.spot.xxx` instead. Remove in release 0.8.0
  from sky.jobs.core import spot_cancel
@@ -163,6 +164,7 @@ __all__ = [
  'StoreType',
  'ClusterStatus',
  'JobStatus',
+ 'ManagedJobStatus',
  # APIs
  'Dag',
  'Task',
@@ -1612,14 +1612,14 @@ def check_can_clone_disk_and_override_task(
  The task to use and the resource handle of the source cluster.
 
  Raises:
- ValueError: If the source cluster does not exist.
+ exceptions.ClusterDoesNotExist: If the source cluster does not exist.
  exceptions.NotSupportedError: If the source cluster is not valid or the
  task is not compatible to clone disk from the source cluster.
  """
  source_cluster_status, handle = refresh_cluster_status_handle(cluster_name)
  if source_cluster_status is None:
  with ux_utils.print_exception_no_traceback():
- raise ValueError(
+ raise exceptions.ClusterDoesNotExist(
  f'Cannot find cluster {cluster_name!r} to clone disk from.')
 
  if not isinstance(handle, backends.CloudVmRayResourceHandle):
@@ -2136,7 +2136,7 @@ def check_cluster_available(
  """Check if the cluster is available.
 
  Raises:
- ValueError: if the cluster does not exist.
+ exceptions.ClusterDoesNotExist: if the cluster does not exist.
  exceptions.ClusterNotUpError: if the cluster is not UP.
  exceptions.NotSupportedError: if the cluster is not based on
  CloudVmRayBackend.
@@ -2201,7 +2201,8 @@ def check_cluster_available(
  error_msg += message
 
  with ux_utils.print_exception_no_traceback():
- raise ValueError(f'{colorama.Fore.YELLOW}{error_msg}{reset}')
+ raise exceptions.ClusterDoesNotExist(
+ f'{colorama.Fore.YELLOW}{error_msg}{reset}')
  assert cluster_status is not None, 'handle is not None but status is None'
  backend = get_backend_from_handle(handle)
  if check_cloud_vm_ray_backend and not isinstance(
@@ -301,6 +301,8 @@ class RayCodeGen:
  )
  def get_or_fail(futures, pg) -> List[int]:
  \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
+ if not futures:
+ return []
  returncodes = [1] * len(futures)
  # Wait for 1 task to be ready.
  ready = []
@@ -3460,15 +3462,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  Returns:
  Job id if the task is submitted to the cluster, None otherwise.
  """
- if task.run is None:
+ if task.run is None and self._setup_cmd is None:
+ # This message is fine without mentioning setup, as there are three
+ # cases when run section is empty:
+ # 1. setup specified, no --detach-setup: setup is executed and this
+ # message is fine for saying no run command specified.
+ # 2. setup specified, with --detach-setup: setup is executed in
+ # detached mode and this message will not be shown.
+ # 3. no setup specified: this message is fine as a user is likely
+ # creating a cluster only, and ok with the empty run command.
  logger.info('Run commands not specified or empty.')
  return None
- # Check the task resources vs the cluster resources. Since `sky exec`
- # will not run the provision and _check_existing_cluster
- # We need to check ports here since sky.exec shouldn't change resources
- valid_resource = self.check_resources_fit_cluster(handle,
- task,
- check_ports=True)
+ if task.run is None:
+ # If the task has no run command, we still need to execute the
+ # generated ray driver program to run the setup command in detached
+ # mode.
+ # In this case, we reset the resources for the task, so that the
+ # detached setup does not need to wait for the task resources to be
+ # ready (which is not used for setup anyway).
+ valid_resource = sky.Resources()
+ else:
+ # Check the task resources vs the cluster resources. Since
+ # `sky exec` will not run the provision and _check_existing_cluster
+ # We need to check ports here since sky.exec shouldn't change
+ # resources.
+ valid_resource = self.check_resources_fit_cluster(handle,
+ task,
+ check_ports=True)
  task_copy = copy.copy(task)
  # Handle multiple resources exec case.
  task_copy.set_resources(valid_resource)
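
Note: `ManagedJobStatus` is now re-exported from the top-level package, so callers can reference managed-job states as `sky.ManagedJobStatus` without importing `sky.jobs` directly. A minimal sketch of the new export (assumes the 1.0.0.dev20241204 wheel is installed; the dictionary field names are assumptions based on the `sky.jobs.queue` return format):

    import sky
    from sky import jobs as managed_jobs

    # The enum is importable straight from the top-level package now.
    done = sky.ManagedJobStatus.SUCCEEDED

    # List managed jobs and flag the finished ones.
    for job in managed_jobs.queue(refresh=False, skip_finished=False):
        if job.get('status') == done:
            print(f"Job {job.get('job_id')} finished successfully.")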
sky/cli.py CHANGED
@@ -3914,16 +3914,25 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
  default=False,
  help=('Show the controller logs of this job; useful for debugging '
  'launching/recoveries, etc.'))
+ @click.option(
+ '--refresh',
+ '-r',
+ default=False,
+ is_flag=True,
+ required=False,
+ help='Query the latest job logs, restarting the jobs controller if stopped.'
+ )
  @click.argument('job_id', required=False, type=int)
  @usage_lib.entrypoint
  def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
- controller: bool):
+ controller: bool, refresh: bool):
  """Tail the log of a managed job."""
  try:
  managed_jobs.tail_logs(name=name,
  job_id=job_id,
  follow=follow,
- controller=controller)
+ controller=controller,
+ refresh=refresh)
  except exceptions.ClusterNotUpError:
  with ux_utils.print_exception_no_traceback():
  raise
@@ -239,13 +239,12 @@ def _list_accelerators(
 
  accelerators_available = accelerator_count - allocated_qty
 
- if accelerator_name not in total_accelerators_available:
- total_accelerators_available[accelerator_name] = 0
  if accelerators_available >= min_quantity_filter:
  quantized_availability = min_quantity_filter * (
  accelerators_available // min_quantity_filter)
- total_accelerators_available[
- accelerator_name] += quantized_availability
+ total_accelerators_available[accelerator_name] = (
+ total_accelerators_available.get(accelerator_name, 0) +
+ quantized_availability)
 
  result = []
 
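
The new `--refresh`/`-r` flag is threaded through to `managed_jobs.tail_logs(..., refresh=...)`, so the same behavior is reachable from the Python API; `sky jobs logs -r 123` is the CLI equivalent. A hedged sketch (the job id 123 is a placeholder):

    from sky import exceptions
    from sky import jobs as managed_jobs

    try:
        # With refresh=True, a stopped jobs controller is restarted before
        # the logs are tailed, matching `sky jobs logs -r 123`.
        managed_jobs.tail_logs(name=None, job_id=123, follow=True,
                               controller=False, refresh=True)
    except exceptions.ClusterNotUpError:
        # Only expected when refresh=False and the controller is stopped.
        print('Jobs controller is not up.')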
sky/core.py CHANGED
@@ -268,7 +268,8 @@ def _start(
  cluster_status, handle = backend_utils.refresh_cluster_status_handle(
  cluster_name)
  if handle is None:
- raise ValueError(f'Cluster {cluster_name!r} does not exist.')
+ raise exceptions.ClusterDoesNotExist(
+ f'Cluster {cluster_name!r} does not exist.')
  if not force and cluster_status == status_lib.ClusterStatus.UP:
  sky_logging.print(f'Cluster {cluster_name!r} is already up.')
  return handle
@@ -359,12 +360,13 @@ def start(
  Useful for upgrading SkyPilot runtime.
 
  Raises:
- ValueError: argument values are invalid: (1) the specified cluster does
- not exist; (2) if ``down`` is set to True but
- ``idle_minutes_to_autostop`` is None; (3) if the specified cluster is
- the managed jobs controller, and either ``idle_minutes_to_autostop``
- is not None or ``down`` is True (omit them to use the default
- autostop settings).
+ ValueError: argument values are invalid: (1) if ``down`` is set to True
+ but ``idle_minutes_to_autostop`` is None; (2) if the specified
+ cluster is the managed jobs controller, and either
+ ``idle_minutes_to_autostop`` is not None or ``down`` is True (omit
+ them to use the default autostop settings).
+ sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+ exist.
  sky.exceptions.NotSupportedError: if the cluster to restart was
  launched using a non-default backend that does not support this
  operation.
@@ -412,7 +414,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
  related resources.
 
  Raises:
- ValueError: the specified cluster does not exist.
+ sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+ exist.
  RuntimeError: failed to stop the cluster.
  sky.exceptions.NotSupportedError: if the specified cluster is a spot
  cluster, or a TPU VM Pod cluster, or the managed jobs controller.
@@ -423,7 +426,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
  f'is not supported.')
  handle = global_user_state.get_handle_from_cluster_name(cluster_name)
  if handle is None:
- raise ValueError(f'Cluster {cluster_name!r} does not exist.')
+ raise exceptions.ClusterDoesNotExist(
+ f'Cluster {cluster_name!r} does not exist.')
 
  backend = backend_utils.get_backend_from_handle(handle)
 
@@ -467,14 +471,16 @@ def down(cluster_name: str, purge: bool = False) -> None:
  resources.
 
  Raises:
- ValueError: the specified cluster does not exist.
+ sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+ exist.
  RuntimeError: failed to tear down the cluster.
  sky.exceptions.NotSupportedError: the specified cluster is the managed
  jobs controller.
  """
  handle = global_user_state.get_handle_from_cluster_name(cluster_name)
  if handle is None:
- raise ValueError(f'Cluster {cluster_name!r} does not exist.')
+ raise exceptions.ClusterDoesNotExist(
+ f'Cluster {cluster_name!r} does not exist.')
 
  usage_lib.record_cluster_name_for_current_operation(cluster_name)
  backend = backend_utils.get_backend_from_handle(handle)
@@ -521,7 +527,7 @@ def autostop(
  rather than autostop (restartable).
 
  Raises:
- ValueError: if the cluster does not exist.
+ sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
  sky.exceptions.ClusterNotUpError: if the cluster is not UP.
  sky.exceptions.NotSupportedError: if the cluster is not based on
  CloudVmRayBackend or the cluster is TPU VM Pod.
@@ -615,7 +621,7 @@ def queue(cluster_name: str,
  }
  ]
  raises:
- ValueError: if the cluster does not exist.
+ sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
  sky.exceptions.ClusterNotUpError: if the cluster is not UP.
  sky.exceptions.NotSupportedError: if the cluster is not based on
  CloudVmRayBackend.
@@ -674,7 +680,8 @@ def cancel(
  worker node is preempted in the spot cluster.
 
  Raises:
- ValueError: if arguments are invalid, or the cluster does not exist.
+ ValueError: if arguments are invalid.
+ sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
  sky.exceptions.ClusterNotUpError: if the cluster is not UP.
  sky.exceptions.NotSupportedError: if the specified cluster is a
  controller that does not support this operation.
@@ -750,8 +757,8 @@ def tail_logs(cluster_name: str,
  Please refer to the sky.cli.tail_logs for the document.
 
  Raises:
- ValueError: arguments are invalid or the cluster is not supported or
- the cluster does not exist.
+ ValueError: if arguments are invalid or the cluster is not supported.
+ sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
  sky.exceptions.ClusterNotUpError: if the cluster is not UP.
  sky.exceptions.NotSupportedError: if the cluster is not based on
  CloudVmRayBackend.
@@ -793,7 +800,7 @@ def download_logs(
  Returns:
  Dict[str, str]: a mapping of job_id to local log path.
  Raises:
- ValueError: if the cluster does not exist.
+ sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
  sky.exceptions.ClusterNotUpError: if the cluster is not UP.
  sky.exceptions.NotSupportedError: if the cluster is not based on
  CloudVmRayBackend.
@@ -838,7 +845,7 @@ def job_status(cluster_name: str,
  If job_ids is None and there is no job on the cluster, it will return
  {None: None}.
  Raises:
- ValueError: if the cluster does not exist.
+ sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
  sky.exceptions.ClusterNotUpError: if the cluster is not UP.
  sky.exceptions.NotSupportedError: if the cluster is not based on
  CloudVmRayBackend.
sky/exceptions.py CHANGED
@@ -132,6 +132,13 @@ class ClusterSetUpError(Exception):
  pass
 
 
+ class ClusterDoesNotExist(ValueError):
+ """Raise when trying to operate on a cluster that does not exist."""
+ # This extends ValueError for compatibility reasons - we used to throw
+ # ValueError instead of this.
+ pass
+
+
  class NotSupportedError(Exception):
  """Raised when a feature is not supported."""
  pass
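
Because `ClusterDoesNotExist` subclasses `ValueError`, existing callers that catch `ValueError` around `sky.start`/`sky.stop`/`sky.down` keep working, while new code can catch the narrower type. A small illustrative sketch (the cluster name is a placeholder):

    import sky
    from sky import exceptions

    try:
        sky.down('my-cluster')
    except exceptions.ClusterDoesNotExist:
        # Raised instead of a bare ValueError as of this release.
        print('Cluster my-cluster does not exist; nothing to tear down.')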
sky/execution.py CHANGED
@@ -581,8 +581,9 @@ def exec( # pylint: disable=redefined-builtin
  submitted.
 
  Raises:
- ValueError: if the specified cluster does not exist or is not in UP
- status.
+ ValueError: if the specified cluster is not in UP status.
+ sky.exceptions.ClusterDoesNotExist: if the specified cluster does not
+ exist.
  sky.exceptions.NotSupportedError: if the specified cluster is a
  controller that does not support this operation.
 
sky/jobs/controller.py CHANGED
@@ -6,7 +6,7 @@ import pathlib
  import time
  import traceback
  import typing
- from typing import Tuple
+ from typing import Optional, Tuple
 
  import filelock
 
@@ -87,18 +87,28 @@ class JobsController:
  task.update_envs(task_envs)
 
  def _download_log_and_stream(
- self,
- handle: cloud_vm_ray_backend.CloudVmRayResourceHandle) -> None:
- """Downloads and streams the logs of the latest job.
+ self, task_id: Optional[int],
+ handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle]
+ ) -> None:
+ """Downloads and streams the logs of the current job with given task ID.
 
  We do not stream the logs from the cluster directly, as the
  donwload and stream should be faster, and more robust against
  preemptions or ssh disconnection during the streaming.
  """
+ if handle is None:
+ logger.info(f'Cluster for job {self._job_id} is not found. '
+ 'Skipping downloading and streaming the logs.')
+ return
  managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
  'managed_jobs')
- controller_utils.download_and_stream_latest_job_log(
+ log_file = controller_utils.download_and_stream_latest_job_log(
  self._backend, handle, managed_job_logs_dir)
+ if log_file is not None:
+ # Set the path of the log file for the current task, so it can be
+ # accessed even after the job is finished
+ managed_job_state.set_local_log_file(self._job_id, task_id,
+ log_file)
  logger.info(f'\n== End of logs (ID: {self._job_id}) ==')
 
  def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
@@ -213,7 +223,8 @@ class JobsController:
  if job_status == job_lib.JobStatus.SUCCEEDED:
  end_time = managed_job_utils.get_job_timestamp(
  self._backend, cluster_name, get_end_time=True)
- # The job is done.
+ # The job is done. Set the job to SUCCEEDED first before start
+ # downloading and streaming the logs to make it more responsive.
  managed_job_state.set_succeeded(self._job_id,
  task_id,
  end_time=end_time,
@@ -221,12 +232,21 @@ class JobsController:
  logger.info(
  f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
  f'Cleaning up the cluster {cluster_name}.')
+ clusters = backend_utils.get_clusters(
+ cluster_names=[cluster_name],
+ refresh=False,
+ include_controller=False)
+ if clusters:
+ assert len(clusters) == 1, (clusters, cluster_name)
+ handle = clusters[0].get('handle')
+ # Best effort to download and stream the logs.
+ self._download_log_and_stream(task_id, handle)
  # Only clean up the cluster, not the storages, because tasks may
  # share storages.
  recovery_strategy.terminate_cluster(cluster_name=cluster_name)
  return True
 
- # For single-node jobs, nonterminated job_status indicates a
+ # For single-node jobs, non-terminated job_status indicates a
  # healthy cluster. We can safely continue monitoring.
  # For multi-node jobs, since the job may not be set to FAILED
  # immediately (depending on user program) when only some of the
@@ -278,7 +298,7 @@ class JobsController:
  'The user job failed. Please check the logs below.\n'
  f'== Logs of the user job (ID: {self._job_id}) ==\n')
 
- self._download_log_and_stream(handle)
+ self._download_log_and_stream(task_id, handle)
  managed_job_status = (
  managed_job_state.ManagedJobStatus.FAILED)
  if job_status == job_lib.JobStatus.FAILED_SETUP:
sky/jobs/core.py CHANGED
@@ -1,6 +1,7 @@
  """SDK functions for managed jobs."""
  import os
  import tempfile
+ import typing
  from typing import Any, Dict, List, Optional, Union
  import uuid
 
@@ -29,6 +30,9 @@ from sky.utils import subprocess_utils
  from sky.utils import timeline
  from sky.utils import ux_utils
 
+ if typing.TYPE_CHECKING:
+ from sky.backends import cloud_vm_ray_backend
+
 
  @timeline.event
  @usage_lib.entrypoint
@@ -225,6 +229,40 @@ def queue_from_kubernetes_pod(
  return jobs
 
 
+ def _maybe_restart_controller(
+ refresh: bool, stopped_message: str, spinner_message: str
+ ) -> 'cloud_vm_ray_backend.CloudVmRayResourceHandle':
+ """Restart controller if refresh is True and it is stopped."""
+ jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
+ if refresh:
+ stopped_message = ''
+ try:
+ handle = backend_utils.is_controller_accessible(
+ controller=jobs_controller_type, stopped_message=stopped_message)
+ except exceptions.ClusterNotUpError as e:
+ if not refresh:
+ raise
+ handle = None
+ controller_status = e.cluster_status
+
+ if handle is not None:
+ return handle
+
+ sky_logging.print(f'{colorama.Fore.YELLOW}'
+ f'Restarting {jobs_controller_type.value.name}...'
+ f'{colorama.Style.RESET_ALL}')
+
+ rich_utils.force_update_status(
+ ux_utils.spinner_message(f'{spinner_message} - restarting '
+ 'controller'))
+ handle = sky.start(jobs_controller_type.value.cluster_name)
+ controller_status = status_lib.ClusterStatus.UP
+ rich_utils.force_update_status(ux_utils.spinner_message(spinner_message))
+
+ assert handle is not None, (controller_status, refresh)
+ return handle
+
+
  @usage_lib.entrypoint
  def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -252,34 +290,11 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
  does not exist.
  RuntimeError: if failed to get the managed jobs with ssh.
  """
- jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
- stopped_message = ''
- if not refresh:
- stopped_message = 'No in-progress managed jobs.'
- try:
- handle = backend_utils.is_controller_accessible(
- controller=jobs_controller_type, stopped_message=stopped_message)
- except exceptions.ClusterNotUpError as e:
- if not refresh:
- raise
- handle = None
- controller_status = e.cluster_status
-
- if refresh and handle is None:
- sky_logging.print(f'{colorama.Fore.YELLOW}'
- 'Restarting controller for latest status...'
- f'{colorama.Style.RESET_ALL}')
-
- rich_utils.force_update_status(
- ux_utils.spinner_message('Checking managed jobs - restarting '
- 'controller'))
- handle = sky.start(jobs_controller_type.value.cluster_name)
- controller_status = status_lib.ClusterStatus.UP
- rich_utils.force_update_status(
- ux_utils.spinner_message('Checking managed jobs'))
-
- assert handle is not None, (controller_status, refresh)
-
+ handle = _maybe_restart_controller(refresh,
+ stopped_message='No in-progress '
+ 'managed jobs.',
+ spinner_message='Checking '
+ 'managed jobs')
  backend = backend_utils.get_backend_from_handle(handle)
  assert isinstance(backend, backends.CloudVmRayBackend)
 
@@ -371,7 +386,7 @@ def cancel(name: Optional[str] = None,
 
  @usage_lib.entrypoint
  def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
- controller: bool) -> None:
+ controller: bool, refresh: bool) -> None:
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
  """Tail logs of managed jobs.
 
@@ -382,15 +397,26 @@ def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
  sky.exceptions.ClusterNotUpError: the jobs controller is not up.
  """
  # TODO(zhwu): Automatically restart the jobs controller
+ if name is not None and job_id is not None:
+ with ux_utils.print_exception_no_traceback():
+ raise ValueError('Cannot specify both name and job_id.')
+
  jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
- handle = backend_utils.is_controller_accessible(
- controller=jobs_controller_type,
+ job_name_or_id_str = ''
+ if job_id is not None:
+ job_name_or_id_str = str(job_id)
+ elif name is not None:
+ job_name_or_id_str = f'-n {name}'
+ else:
+ job_name_or_id_str = ''
+ handle = _maybe_restart_controller(
+ refresh,
  stopped_message=(
- 'Please restart the jobs controller with '
- f'`sky start {jobs_controller_type.value.cluster_name}`.'))
+ f'{jobs_controller_type.value.name.capitalize()} is stopped. To '
+ f'get the logs, run: {colorama.Style.BRIGHT}sky jobs logs '
+ f'-r {job_name_or_id_str}{colorama.Style.RESET_ALL}'),
+ spinner_message='Retrieving job logs')
 
- if name is not None and job_id is not None:
- raise ValueError('Cannot specify both name and job_id.')
  backend = backend_utils.get_backend_from_handle(handle)
  assert isinstance(backend, backends.CloudVmRayBackend), backend
 
@@ -50,8 +50,9 @@ def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
  usage_lib.messages.usage.set_internal()
  sky.down(cluster_name)
  return
- except ValueError:
+ except exceptions.ClusterDoesNotExist:
  # The cluster is already down.
+ logger.debug(f'The cluster {cluster_name} is already down.')
  return
  except Exception as e: # pylint: disable=broad-except
  retry_cnt += 1
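
With the restart logic factored into `_maybe_restart_controller`, both `queue(refresh=True)` and `tail_logs(..., refresh=True)` share the same behavior: if the jobs controller is stopped, it is restarted via `sky.start` before the request proceeds. An illustrative sketch of the caller-facing effect:

    from sky import jobs as managed_jobs

    # If the jobs controller is stopped, refresh=True restarts it first and
    # then returns the latest queue; refresh=False would instead raise
    # sky.exceptions.ClusterNotUpError.
    jobs = managed_jobs.queue(refresh=True, skip_finished=True)
    print(f'{len(jobs)} unfinished managed jobs.')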
sky/jobs/state.py CHANGED
@@ -66,7 +66,8 @@ def create_table(cursor, conn):
  spot_job_id INTEGER,
  task_id INTEGER DEFAULT 0,
  task_name TEXT,
- specs TEXT)""")
+ specs TEXT,
+ local_log_file TEXT DEFAULT NULL)""")
  conn.commit()
 
  db_utils.add_column_to_table(cursor, conn, 'spot', 'failure_reason', 'TEXT')
@@ -103,6 +104,8 @@ def create_table(cursor, conn):
  value_to_replace_existing_entries=json.dumps({
  'max_restarts_on_errors': 0,
  }))
+ db_utils.add_column_to_table(cursor, conn, 'spot', 'local_log_file',
+ 'TEXT DEFAULT NULL')
 
  # `job_info` contains the mapping from job_id to the job_name.
  # In the future, it may contain more information about each job.
@@ -157,6 +160,7 @@ columns = [
  'task_id',
  'task_name',
  'specs',
+ 'local_log_file',
  # columns from the job_info table
  '_job_info_job_id', # This should be the same as job_id
  'job_name',
@@ -512,6 +516,20 @@ def set_cancelled(job_id: int, callback_func: CallbackType):
  callback_func('CANCELLED')
 
 
+ def set_local_log_file(job_id: int, task_id: Optional[int],
+ local_log_file: str):
+ """Set the local log file for a job."""
+ filter_str = 'spot_job_id=(?)'
+ filter_args = [local_log_file, job_id]
+ if task_id is not None:
+ filter_str += ' AND task_id=(?)'
+ filter_args.append(task_id)
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
+ cursor.execute(
+ 'UPDATE spot SET local_log_file=(?) '
+ f'WHERE {filter_str}', filter_args)
+
+
  # ======== utility functions ========
  def get_nonterminal_job_ids_by_name(name: Optional[str]) -> List[int]:
  """Get non-terminal job ids by name."""
@@ -662,3 +680,17 @@ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
  WHERE spot_job_id=(?) AND task_id=(?)""",
  (job_id, task_id)).fetchone()
  return json.loads(task_specs[0])
+
+
+ def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
+ """Get the local log directory for a job."""
+ filter_str = 'spot_job_id=(?)'
+ filter_args = [job_id]
+ if task_id is not None:
+ filter_str += ' AND task_id=(?)'
+ filter_args.append(task_id)
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
+ local_log_file = cursor.execute(
+ f'SELECT local_log_file FROM spot '
+ f'WHERE {filter_str}', filter_args).fetchone()
+ return local_log_file[-1] if local_log_file else None
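
The two new accessors share the same optional-filter pattern: the WHERE clause is built up incrementally and the parameters are passed positionally, with `task_id` appended only when it is given. A standalone sketch of that pattern against a throwaway SQLite table (not SkyPilot's actual DB helpers, which go through `db_utils.safe_cursor`; the path is a placeholder):

    import sqlite3
    from typing import Optional

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE spot (spot_job_id INT, task_id INT, '
                 'local_log_file TEXT DEFAULT NULL)')
    conn.execute('INSERT INTO spot VALUES (1, 0, NULL)')

    def set_local_log_file(job_id: int, task_id: Optional[int], path: str):
        # Filter on job id, and additionally on task id only when provided.
        filter_str = 'spot_job_id=(?)'
        args = [path, job_id]
        if task_id is not None:
            filter_str += ' AND task_id=(?)'
            args.append(task_id)
        conn.execute(f'UPDATE spot SET local_log_file=(?) WHERE {filter_str}',
                     args)

    set_local_log_file(1, 0, '~/sky_logs/managed_jobs/job-1.log')
    print(conn.execute('SELECT local_log_file FROM spot').fetchone()[0])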
sky/jobs/utils.py CHANGED
@@ -327,10 +327,24 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
  if managed_job_status.is_failed():
  job_msg = ('\nFailure reason: '
  f'{managed_job_state.get_failure_reason(job_id)}')
+ log_file = managed_job_state.get_local_log_file(job_id, None)
+ if log_file is not None:
+ with open(log_file, 'r', encoding='utf-8') as f:
+ # Stream the logs to the console without reading the whole
+ # file into memory.
+ start_streaming = False
+ for line in f:
+ if log_lib.LOG_FILE_START_STREAMING_AT in line:
+ start_streaming = True
+ if start_streaming:
+ print(line, end='', flush=True)
+ return ''
  return (f'{colorama.Fore.YELLOW}'
  f'Job {job_id} is already in terminal state '
- f'{managed_job_status.value}. Logs will not be shown.'
- f'{colorama.Style.RESET_ALL}{job_msg}')
+ f'{managed_job_status.value}. For more details, run: '
+ f'sky jobs logs --controller {job_id}'
+ f'{colorama.Style.RESET_ALL}'
+ f'{job_msg}')
  backend = backends.CloudVmRayBackend()
  task_id, managed_job_status = (
  managed_job_state.get_latest_task_id_status(job_id))
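
The local-log fallback streams the saved file starting at the driver's start marker instead of loading the whole file into memory. A self-contained sketch of that skip-then-stream pattern (the marker string below is a stand-in for `log_lib.LOG_FILE_START_STREAMING_AT`, whose exact value is not shown in this diff):

    START_MARKER = 'Waiting for task resources'  # assumed placeholder value

    def stream_from_marker(path: str) -> None:
        # Print lines from the first occurrence of the marker onward,
        # iterating over the file so it is never fully read into memory.
        start_streaming = False
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                if START_MARKER in line:
                    start_streaming = True
                if start_streaming:
                    print(line, end='', flush=True)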