skypilot-nightly 1.0.0.dev20241203__py3-none-any.whl → 1.0.0.dev20241205__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/core.py CHANGED
@@ -268,7 +268,8 @@ def _start(
     cluster_status, handle = backend_utils.refresh_cluster_status_handle(
         cluster_name)
     if handle is None:
-        raise ValueError(f'Cluster {cluster_name!r} does not exist.')
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')
     if not force and cluster_status == status_lib.ClusterStatus.UP:
         sky_logging.print(f'Cluster {cluster_name!r} is already up.')
         return handle
@@ -359,12 +360,13 @@ def start(
     Useful for upgrading SkyPilot runtime.

     Raises:
-        ValueError: argument values are invalid: (1) the specified cluster does
-            not exist; (2) if ``down`` is set to True but
-            ``idle_minutes_to_autostop`` is None; (3) if the specified cluster is
-            the managed jobs controller, and either ``idle_minutes_to_autostop``
-            is not None or ``down`` is True (omit them to use the default
-            autostop settings).
+        ValueError: argument values are invalid: (1) if ``down`` is set to True
+            but ``idle_minutes_to_autostop`` is None; (2) if the specified
+            cluster is the managed jobs controller, and either
+            ``idle_minutes_to_autostop`` is not None or ``down`` is True (omit
+            them to use the default autostop settings).
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         sky.exceptions.NotSupportedError: if the cluster to restart was
             launched using a non-default backend that does not support this
             operation.
@@ -412,7 +414,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
        related resources.

     Raises:
-        ValueError: the specified cluster does not exist.
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         RuntimeError: failed to stop the cluster.
         sky.exceptions.NotSupportedError: if the specified cluster is a spot
             cluster, or a TPU VM Pod cluster, or the managed jobs controller.
@@ -423,7 +426,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
                              f'is not supported.')
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     if handle is None:
-        raise ValueError(f'Cluster {cluster_name!r} does not exist.')
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')

     backend = backend_utils.get_backend_from_handle(handle)

@@ -467,14 +471,16 @@ def down(cluster_name: str, purge: bool = False) -> None:
        resources.

     Raises:
-        ValueError: the specified cluster does not exist.
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         RuntimeError: failed to tear down the cluster.
         sky.exceptions.NotSupportedError: the specified cluster is the managed
             jobs controller.
     """
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     if handle is None:
-        raise ValueError(f'Cluster {cluster_name!r} does not exist.')
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')

     usage_lib.record_cluster_name_for_current_operation(cluster_name)
     backend = backend_utils.get_backend_from_handle(handle)
@@ -521,7 +527,7 @@ def autostop(
        rather than autostop (restartable).

     Raises:
-        ValueError: if the cluster does not exist.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend or the cluster is TPU VM Pod.
@@ -615,7 +621,7 @@ def queue(cluster_name: str,
            }
        ]
     raises:
-        ValueError: if the cluster does not exist.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -674,7 +680,8 @@ def cancel(
        worker node is preempted in the spot cluster.

     Raises:
-        ValueError: if arguments are invalid, or the cluster does not exist.
+        ValueError: if arguments are invalid.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the specified cluster is a
             controller that does not support this operation.
@@ -750,8 +757,8 @@ def tail_logs(cluster_name: str,
     Please refer to the sky.cli.tail_logs for the document.

     Raises:
-        ValueError: arguments are invalid or the cluster is not supported or
-            the cluster does not exist.
+        ValueError: if arguments are invalid or the cluster is not supported.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -793,7 +800,7 @@ def download_logs(
     Returns:
         Dict[str, str]: a mapping of job_id to local log path.
     Raises:
-        ValueError: if the cluster does not exist.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -838,7 +845,7 @@ def job_status(cluster_name: str,
     If job_ids is None and there is no job on the cluster, it will return
     {None: None}.
     Raises:
-        ValueError: if the cluster does not exist.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
sky/exceptions.py CHANGED
@@ -132,6 +132,13 @@ class ClusterSetUpError(Exception):
     pass


+class ClusterDoesNotExist(ValueError):
+    """Raise when trying to operate on a cluster that does not exist."""
+    # This extends ValueError for compatibility reasons - we used to throw
+    # ValueError instead of this.
+    pass
+
+
 class NotSupportedError(Exception):
     """Raised when a feature is not supported."""
     pass
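
Because `ClusterDoesNotExist` subclasses `ValueError` (per the compatibility comment above), callers that previously caught `ValueError` from `sky.start`, `sky.stop`, `sky.down`, etc. keep working unchanged. A minimal sketch of how downstream code might opt into the more specific exception; the cluster name is made up for illustration:

    import sky
    from sky import exceptions

    try:
        sky.stop('my-cluster')  # 'my-cluster' is a hypothetical name
    except exceptions.ClusterDoesNotExist:
        # New in this release: the specific exception for a missing cluster.
        print('Cluster does not exist; nothing to stop.')
    except ValueError as e:
        # Older handlers that only catch ValueError remain valid, because
        # ClusterDoesNotExist extends ValueError.
        print(f'Invalid argument: {e}')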
sky/execution.py CHANGED
@@ -108,6 +108,7 @@ def _execute(
     idle_minutes_to_autostop: Optional[int] = None,
     no_setup: bool = False,
     clone_disk_from: Optional[str] = None,
+    skip_unnecessary_provisioning: bool = False,
     # Internal only:
     # pylint: disable=invalid-name
     _is_launched_by_jobs_controller: bool = False,
@@ -128,8 +129,9 @@ def _execute(
       Note that if errors occur during provisioning/data syncing/setting up,
       the cluster will not be torn down for debugging purposes.
     stream_logs: bool; whether to stream all tasks' outputs to the client.
-    handle: Optional[backends.ResourceHandle]; if provided, execution will use
-      an existing backend cluster handle instead of provisioning a new one.
+    handle: Optional[backends.ResourceHandle]; if provided, execution will
+      attempt to use an existing backend cluster handle instead of
+      provisioning a new one.
     backend: Backend; backend to use for executing the tasks. Defaults to
       CloudVmRayBackend()
     retry_until_up: bool; whether to retry the provisioning until the cluster
@@ -150,6 +152,11 @@ def _execute(
     idle_minutes_to_autostop: int; if provided, the cluster will be set to
       autostop after this many minutes of idleness.
     no_setup: bool; whether to skip setup commands or not when (re-)launching.
+    clone_disk_from: Optional[str]; if set, clone the disk from the specified
+      cluster.
+    skip_unecessary_provisioning: bool; if True, compare the calculated
+      cluster config to the current cluster's config. If they match, shortcut
+      provisioning even if we have Stage.PROVISION.

   Returns:
     job_id: Optional[int]; the job ID of the submitted job. None if the
@@ -288,13 +295,18 @@ def _execute(

     try:
         if Stage.PROVISION in stages:
-            if handle is None:
-                handle = backend.provision(task,
-                                           task.best_resources,
-                                           dryrun=dryrun,
-                                           stream_logs=stream_logs,
-                                           cluster_name=cluster_name,
-                                           retry_until_up=retry_until_up)
+            assert handle is None or skip_unnecessary_provisioning, (
+                'Provisioning requested, but handle is already set. PROVISION '
+                'should be excluded from stages or '
+                'skip_unecessary_provisioning should be set. ')
+            handle = backend.provision(
+                task,
+                task.best_resources,
+                dryrun=dryrun,
+                stream_logs=stream_logs,
+                cluster_name=cluster_name,
+                retry_until_up=retry_until_up,
+                skip_unnecessary_provisioning=skip_unnecessary_provisioning)

         if handle is None:
             assert dryrun, ('If not dryrun, handle must be set or '
@@ -469,6 +481,7 @@ def launch(

     handle = None
     stages = None
+    skip_unnecessary_provisioning = False
     # Check if cluster exists and we are doing fast provisioning
     if fast and cluster_name is not None:
         cluster_status, maybe_handle = (
@@ -502,12 +515,16 @@ def launch(
         if cluster_status == status_lib.ClusterStatus.UP:
             handle = maybe_handle
             stages = [
+                # Provisioning will be short-circuited if the existing
+                # cluster config hash matches the calculated one.
+                Stage.PROVISION,
                 Stage.SYNC_WORKDIR,
                 Stage.SYNC_FILE_MOUNTS,
                 Stage.PRE_EXEC,
                 Stage.EXEC,
                 Stage.DOWN,
             ]
+            skip_unnecessary_provisioning = True

     return _execute(
         entrypoint=entrypoint,
@@ -525,6 +542,7 @@ def launch(
         idle_minutes_to_autostop=idle_minutes_to_autostop,
         no_setup=no_setup,
         clone_disk_from=clone_disk_from,
+        skip_unnecessary_provisioning=skip_unnecessary_provisioning,
         _is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
         _is_launched_by_sky_serve_controller=
         _is_launched_by_sky_serve_controller,
@@ -581,8 +599,9 @@ def exec(  # pylint: disable=redefined-builtin
         submitted.

     Raises:
-        ValueError: if the specified cluster does not exist or is not in UP
-            status.
+        ValueError: if the specified cluster is not in UP status.
+        sky.exceptions.ClusterDoesNotExist: if the specified cluster does not
+            exist.
         sky.exceptions.NotSupportedError: if the specified cluster is a
             controller that does not support this operation.
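
The new `skip_unnecessary_provisioning` argument is internal to `_execute`, but it changes the behavior of the existing `fast=True` path of `sky.launch`: instead of dropping `Stage.PROVISION` entirely for an UP cluster, a fast launch now re-enters provisioning and short-circuits it only when the stored cluster config hash matches the newly computed one. A rough sketch of the user-facing flow, with an illustrative task and cluster name:

    import sky

    task = sky.Task(run='echo hello')

    # The first launch provisions the cluster and records its config hash.
    sky.launch(task, cluster_name='fast-demo', fast=True)

    # Relaunching with fast=True still enters Stage.PROVISION, but the
    # provisioning work is skipped when the recorded config hash is unchanged;
    # otherwise provisioning runs as usual.
    sky.launch(task, cluster_name='fast-demo', fast=True)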
 
sky/global_user_state.py CHANGED
@@ -61,7 +61,8 @@ def create_table(cursor, conn):
         cluster_hash TEXT DEFAULT null,
         storage_mounts_metadata BLOB DEFAULT null,
         cluster_ever_up INTEGER DEFAULT 0,
-        status_updated_at INTEGER DEFAULT null)""")
+        status_updated_at INTEGER DEFAULT null,
+        config_hash TEXT DEFAULT null)""")

     # Table for Cluster History
     # usage_intervals: List[Tuple[int, int]]
@@ -135,6 +136,9 @@ def create_table(cursor, conn):
     db_utils.add_column_to_table(cursor, conn, 'clusters', 'status_updated_at',
                                  'INTEGER DEFAULT null')

+    db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
+                                 'TEXT DEFAULT null')
+
     conn.commit()


@@ -145,7 +149,8 @@ def add_or_update_cluster(cluster_name: str,
                           cluster_handle: 'backends.ResourceHandle',
                           requested_resources: Optional[Set[Any]],
                           ready: bool,
-                          is_launch: bool = True):
+                          is_launch: bool = True,
+                          config_hash: Optional[str] = None):
     """Adds or updates cluster_name -> cluster_handle mapping.

     Args:
@@ -197,7 +202,8 @@ def add_or_update_cluster(cluster_name: str,
         # specified.
         '(name, launched_at, handle, last_use, status, '
         'autostop, to_down, metadata, owner, cluster_hash, '
-        'storage_mounts_metadata, cluster_ever_up, status_updated_at) '
+        'storage_mounts_metadata, cluster_ever_up, status_updated_at, '
+        'config_hash) '
         'VALUES ('
         # name
         '?, '
@@ -236,7 +242,9 @@ def add_or_update_cluster(cluster_name: str,
         # cluster_ever_up
         '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?),'
         # status_updated_at
-        '?'
+        '?,'
+        # config_hash
+        'COALESCE(?, (SELECT config_hash FROM clusters WHERE name=?))'
         ')',
         (
             # name
@@ -270,6 +278,9 @@ def add_or_update_cluster(cluster_name: str,
             int(ready),
             # status_updated_at
             status_updated_at,
+            # config_hash
+            config_hash,
+            cluster_name,
         ))

     launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
@@ -585,15 +596,15 @@ def get_cluster_from_name(
     rows = _DB.cursor.execute(
         'SELECT name, launched_at, handle, last_use, status, autostop, '
         'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
-        'cluster_ever_up, status_updated_at FROM clusters WHERE name=(?)',
-        (cluster_name,)).fetchall()
+        'cluster_ever_up, status_updated_at, config_hash '
+        'FROM clusters WHERE name=(?)', (cluster_name,)).fetchall()
     for row in rows:
         # Explicitly specify the number of fields to unpack, so that
         # we can add new fields to the database in the future without
         # breaking the previous code.
         (name, launched_at, handle, last_use, status, autostop, metadata,
          to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
-         status_updated_at) = row[:13]
+         status_updated_at, config_hash) = row[:14]
         # TODO: use namedtuple instead of dict
         record = {
             'name': name,
@@ -610,6 +621,7 @@ def get_cluster_from_name(
                 _load_storage_mounts_metadata(storage_mounts_metadata),
             'cluster_ever_up': bool(cluster_ever_up),
             'status_updated_at': status_updated_at,
+            'config_hash': config_hash,
         }
         return record
     return None
@@ -619,13 +631,13 @@ def get_clusters() -> List[Dict[str, Any]]:
     rows = _DB.cursor.execute(
         'select name, launched_at, handle, last_use, status, autostop, '
         'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
-        'cluster_ever_up, status_updated_at from clusters '
-        'order by launched_at desc').fetchall()
+        'cluster_ever_up, status_updated_at, config_hash '
+        'from clusters order by launched_at desc').fetchall()
     records = []
     for row in rows:
         (name, launched_at, handle, last_use, status, autostop, metadata,
          to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
-         status_updated_at) = row[:13]
+         status_updated_at, config_hash) = row[:14]
         # TODO: use namedtuple instead of dict
         record = {
             'name': name,
@@ -642,6 +654,7 @@ def get_clusters() -> List[Dict[str, Any]]:
                 _load_storage_mounts_metadata(storage_mounts_metadata),
             'cluster_ever_up': bool(cluster_ever_up),
             'status_updated_at': status_updated_at,
+            'config_hash': config_hash,
         }

         records.append(record)
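
The `COALESCE(?, (SELECT config_hash FROM clusters WHERE name=?))` clause means an update that passes `config_hash=None` preserves whatever hash is already stored, while a non-None value overwrites it. A standalone sqlite3 sketch of just that upsert semantics; the table and values are illustrative, not the real SkyPilot schema:

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute(
        'CREATE TABLE clusters (name TEXT PRIMARY KEY, config_hash TEXT)')
    conn.execute("INSERT INTO clusters VALUES ('c1', 'hash-v1')")


    def upsert(name, config_hash):
        # None keeps the previously stored hash; a value replaces it.
        conn.execute(
            'INSERT OR REPLACE INTO clusters (name, config_hash) VALUES '
            '(?, COALESCE(?, (SELECT config_hash FROM clusters WHERE name=?)))',
            (name, config_hash, name))


    upsert('c1', None)       # config_hash stays 'hash-v1'
    upsert('c1', 'hash-v2')  # config_hash becomes 'hash-v2'
    print(conn.execute('SELECT name, config_hash FROM clusters').fetchall())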
sky/jobs/controller.py CHANGED
@@ -6,7 +6,7 @@ import pathlib
 import time
 import traceback
 import typing
-from typing import Tuple
+from typing import Optional, Tuple

 import filelock

@@ -87,18 +87,28 @@ class JobsController:
             task.update_envs(task_envs)

     def _download_log_and_stream(
-        self,
-        handle: cloud_vm_ray_backend.CloudVmRayResourceHandle) -> None:
-        """Downloads and streams the logs of the latest job.
+        self, task_id: Optional[int],
+        handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle]
+    ) -> None:
+        """Downloads and streams the logs of the current job with given task ID.

         We do not stream the logs from the cluster directly, as the
         donwload and stream should be faster, and more robust against
         preemptions or ssh disconnection during the streaming.
         """
+        if handle is None:
+            logger.info(f'Cluster for job {self._job_id} is not found. '
+                        'Skipping downloading and streaming the logs.')
+            return
         managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                             'managed_jobs')
-        controller_utils.download_and_stream_latest_job_log(
+        log_file = controller_utils.download_and_stream_latest_job_log(
             self._backend, handle, managed_job_logs_dir)
+        if log_file is not None:
+            # Set the path of the log file for the current task, so it can be
+            # accessed even after the job is finished
+            managed_job_state.set_local_log_file(self._job_id, task_id,
+                                                 log_file)
         logger.info(f'\n== End of logs (ID: {self._job_id}) ==')

     def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
@@ -213,7 +223,8 @@ class JobsController:
             if job_status == job_lib.JobStatus.SUCCEEDED:
                 end_time = managed_job_utils.get_job_timestamp(
                     self._backend, cluster_name, get_end_time=True)
-                # The job is done.
+                # The job is done. Set the job to SUCCEEDED first before start
+                # downloading and streaming the logs to make it more responsive.
                 managed_job_state.set_succeeded(self._job_id,
                                                 task_id,
                                                 end_time=end_time,
@@ -221,12 +232,21 @@ class JobsController:
                 logger.info(
                     f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
                     f'Cleaning up the cluster {cluster_name}.')
+                clusters = backend_utils.get_clusters(
+                    cluster_names=[cluster_name],
+                    refresh=False,
+                    include_controller=False)
+                if clusters:
+                    assert len(clusters) == 1, (clusters, cluster_name)
+                    handle = clusters[0].get('handle')
+                    # Best effort to download and stream the logs.
+                    self._download_log_and_stream(task_id, handle)
                 # Only clean up the cluster, not the storages, because tasks may
                 # share storages.
                 recovery_strategy.terminate_cluster(cluster_name=cluster_name)
                 return True

-            # For single-node jobs, nonterminated job_status indicates a
+            # For single-node jobs, non-terminated job_status indicates a
             # healthy cluster. We can safely continue monitoring.
             # For multi-node jobs, since the job may not be set to FAILED
             # immediately (depending on user program) when only some of the
@@ -278,7 +298,7 @@ class JobsController:
                 'The user job failed. Please check the logs below.\n'
                 f'== Logs of the user job (ID: {self._job_id}) ==\n')

-            self._download_log_and_stream(handle)
+            self._download_log_and_stream(task_id, handle)
             managed_job_status = (
                 managed_job_state.ManagedJobStatus.FAILED)
             if job_status == job_lib.JobStatus.FAILED_SETUP:
sky/jobs/core.py CHANGED
@@ -1,6 +1,7 @@
 """SDK functions for managed jobs."""
 import os
 import tempfile
+import typing
 from typing import Any, Dict, List, Optional, Union
 import uuid

@@ -29,6 +30,9 @@ from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils

+if typing.TYPE_CHECKING:
+    from sky.backends import cloud_vm_ray_backend
+

 @timeline.event
 @usage_lib.entrypoint
@@ -225,6 +229,40 @@ def queue_from_kubernetes_pod(
     return jobs


+def _maybe_restart_controller(
+        refresh: bool, stopped_message: str, spinner_message: str
+) -> 'cloud_vm_ray_backend.CloudVmRayResourceHandle':
+    """Restart controller if refresh is True and it is stopped."""
+    jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
+    if refresh:
+        stopped_message = ''
+    try:
+        handle = backend_utils.is_controller_accessible(
+            controller=jobs_controller_type, stopped_message=stopped_message)
+    except exceptions.ClusterNotUpError as e:
+        if not refresh:
+            raise
+        handle = None
+        controller_status = e.cluster_status
+
+    if handle is not None:
+        return handle
+
+    sky_logging.print(f'{colorama.Fore.YELLOW}'
+                      f'Restarting {jobs_controller_type.value.name}...'
+                      f'{colorama.Style.RESET_ALL}')
+
+    rich_utils.force_update_status(
+        ux_utils.spinner_message(f'{spinner_message} - restarting '
+                                 'controller'))
+    handle = sky.start(jobs_controller_type.value.cluster_name)
+    controller_status = status_lib.ClusterStatus.UP
+    rich_utils.force_update_status(ux_utils.spinner_message(spinner_message))
+
+    assert handle is not None, (controller_status, refresh)
+    return handle
+
+
 @usage_lib.entrypoint
 def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -252,34 +290,11 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
             does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
-    jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
-    stopped_message = ''
-    if not refresh:
-        stopped_message = 'No in-progress managed jobs.'
-    try:
-        handle = backend_utils.is_controller_accessible(
-            controller=jobs_controller_type, stopped_message=stopped_message)
-    except exceptions.ClusterNotUpError as e:
-        if not refresh:
-            raise
-        handle = None
-        controller_status = e.cluster_status
-
-    if refresh and handle is None:
-        sky_logging.print(f'{colorama.Fore.YELLOW}'
-                          'Restarting controller for latest status...'
-                          f'{colorama.Style.RESET_ALL}')
-
-        rich_utils.force_update_status(
-            ux_utils.spinner_message('Checking managed jobs - restarting '
-                                     'controller'))
-        handle = sky.start(jobs_controller_type.value.cluster_name)
-        controller_status = status_lib.ClusterStatus.UP
-        rich_utils.force_update_status(
-            ux_utils.spinner_message('Checking managed jobs'))
-
-    assert handle is not None, (controller_status, refresh)
-
+    handle = _maybe_restart_controller(refresh,
+                                       stopped_message='No in-progress '
+                                       'managed jobs.',
+                                       spinner_message='Checking '
+                                       'managed jobs')
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)

@@ -371,7 +386,7 @@ def cancel(name: Optional[str] = None,

 @usage_lib.entrypoint
 def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
-              controller: bool) -> None:
+              controller: bool, refresh: bool) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Tail logs of managed jobs.

@@ -382,15 +397,26 @@ def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
         sky.exceptions.ClusterNotUpError: the jobs controller is not up.
     """
     # TODO(zhwu): Automatically restart the jobs controller
+    if name is not None and job_id is not None:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Cannot specify both name and job_id.')
+
     jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
-    handle = backend_utils.is_controller_accessible(
-        controller=jobs_controller_type,
+    job_name_or_id_str = ''
+    if job_id is not None:
+        job_name_or_id_str = str(job_id)
+    elif name is not None:
+        job_name_or_id_str = f'-n {name}'
+    else:
+        job_name_or_id_str = ''
+    handle = _maybe_restart_controller(
+        refresh,
         stopped_message=(
-            'Please restart the jobs controller with '
-            f'`sky start {jobs_controller_type.value.cluster_name}`.'))
+            f'{jobs_controller_type.value.name.capitalize()} is stopped. To '
+            f'get the logs, run: {colorama.Style.BRIGHT}sky jobs logs '
+            f'-r {job_name_or_id_str}{colorama.Style.RESET_ALL}'),
+        spinner_message='Retrieving job logs')

-    if name is not None and job_id is not None:
-        raise ValueError('Cannot specify both name and job_id.')
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend), backend

@@ -50,8 +50,9 @@ def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
             usage_lib.messages.usage.set_internal()
             sky.down(cluster_name)
             return
-        except ValueError:
+        except exceptions.ClusterDoesNotExist:
             # The cluster is already down.
+            logger.debug(f'The cluster {cluster_name} is already down.')
             return
         except Exception as e:  # pylint: disable=broad-except
             retry_cnt += 1
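
With the new `refresh` parameter, tailing managed job logs can transparently restart a stopped jobs controller (using the same `_maybe_restart_controller` helper that `queue` now uses), which the message above also surfaces on the CLI as `sky jobs logs -r`. A hedged sketch of the Python call, assuming `tail_logs` is re-exported under `sky.jobs` as in recent releases and using a made-up job id:

    from sky import jobs

    # Follow the logs of managed job 42. If the jobs controller is stopped
    # and refresh=True, it is restarted first instead of raising
    # ClusterNotUpError.
    jobs.tail_logs(name=None, job_id=42, follow=True, controller=False,
                   refresh=True)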
sky/jobs/state.py CHANGED
@@ -66,7 +66,8 @@ def create_table(cursor, conn):
         spot_job_id INTEGER,
         task_id INTEGER DEFAULT 0,
         task_name TEXT,
-        specs TEXT)""")
+        specs TEXT,
+        local_log_file TEXT DEFAULT NULL)""")
     conn.commit()

     db_utils.add_column_to_table(cursor, conn, 'spot', 'failure_reason', 'TEXT')
@@ -103,6 +104,8 @@ def create_table(cursor, conn):
         value_to_replace_existing_entries=json.dumps({
             'max_restarts_on_errors': 0,
         }))
+    db_utils.add_column_to_table(cursor, conn, 'spot', 'local_log_file',
+                                 'TEXT DEFAULT NULL')

     # `job_info` contains the mapping from job_id to the job_name.
     # In the future, it may contain more information about each job.
@@ -157,6 +160,7 @@ columns = [
     'task_id',
     'task_name',
     'specs',
+    'local_log_file',
     # columns from the job_info table
     '_job_info_job_id',  # This should be the same as job_id
     'job_name',
@@ -512,6 +516,20 @@ def set_cancelled(job_id: int, callback_func: CallbackType):
         callback_func('CANCELLED')


+def set_local_log_file(job_id: int, task_id: Optional[int],
+                       local_log_file: str):
+    """Set the local log file for a job."""
+    filter_str = 'spot_job_id=(?)'
+    filter_args = [local_log_file, job_id]
+    if task_id is not None:
+        filter_str += ' AND task_id=(?)'
+        filter_args.append(task_id)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute(
+            'UPDATE spot SET local_log_file=(?) '
+            f'WHERE {filter_str}', filter_args)
+
+
 # ======== utility functions ========
 def get_nonterminal_job_ids_by_name(name: Optional[str]) -> List[int]:
     """Get non-terminal job ids by name."""
@@ -662,3 +680,17 @@ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
             WHERE spot_job_id=(?) AND task_id=(?)""",
             (job_id, task_id)).fetchone()
     return json.loads(task_specs[0])
+
+
+def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
+    """Get the local log directory for a job."""
+    filter_str = 'spot_job_id=(?)'
+    filter_args = [job_id]
+    if task_id is not None:
+        filter_str += ' AND task_id=(?)'
+        filter_args.append(task_id)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        local_log_file = cursor.execute(
+            f'SELECT local_log_file FROM spot '
+            f'WHERE {filter_str}', filter_args).fetchone()
+    return local_log_file[-1] if local_log_file else None
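
The two new helpers give the controller a place to persist where a task's logs were downloaded, so they can still be served after the job's cluster is gone. A small sketch of their use, assuming the managed-jobs state database is initialized; the job id, task id, and path are illustrative:

    from sky.jobs import state as managed_job_state

    # Record the local path of the downloaded logs for task 0 of managed
    # job 42...
    managed_job_state.set_local_log_file(
        42, 0, '/tmp/managed_jobs/job-42-task-0.log')

    # ...and read it back later, e.g. to serve `sky jobs logs` after the
    # job's cluster has been torn down.
    print(managed_job_state.get_local_log_file(42, 0))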