skypilot-nightly 1.0.0.dev20241203__py3-none-any.whl → 1.0.0.dev20241205__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/backends/backend.py +42 -15
- sky/backends/backend_utils.py +143 -9
- sky/backends/cloud_vm_ray_backend.py +103 -25
- sky/backends/local_docker_backend.py +11 -7
- sky/cli.py +11 -2
- sky/clouds/service_catalog/common.py +2 -2
- sky/core.py +25 -18
- sky/exceptions.py +7 -0
- sky/execution.py +30 -11
- sky/global_user_state.py +23 -10
- sky/jobs/controller.py +28 -8
- sky/jobs/core.py +61 -35
- sky/jobs/recovery_strategy.py +2 -1
- sky/jobs/state.py +33 -1
- sky/jobs/utils.py +16 -2
- sky/setup_files/dependencies.py +141 -0
- sky/setup_files/setup.py +12 -124
- sky/skylet/constants.py +36 -11
- sky/skylet/log_lib.py +3 -1
- sky/skylet/log_lib.pyi +3 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -2
- sky/utils/common_utils.py +19 -0
- sky/utils/controller_utils.py +60 -98
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/METADATA +3 -2
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/RECORD +30 -29
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241203.dist-info → skypilot_nightly-1.0.0.dev20241205.dist-info}/top_level.txt +0 -0
sky/core.py
CHANGED
@@ -268,7 +268,8 @@ def _start(
     cluster_status, handle = backend_utils.refresh_cluster_status_handle(
         cluster_name)
     if handle is None:
-        raise
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')
     if not force and cluster_status == status_lib.ClusterStatus.UP:
         sky_logging.print(f'Cluster {cluster_name!r} is already up.')
         return handle
@@ -359,12 +360,13 @@ def start(
         Useful for upgrading SkyPilot runtime.

     Raises:
-        ValueError: argument values are invalid: (1)
-
-
-
-
-
+        ValueError: argument values are invalid: (1) if ``down`` is set to True
+            but ``idle_minutes_to_autostop`` is None; (2) if the specified
+            cluster is the managed jobs controller, and either
+            ``idle_minutes_to_autostop`` is not None or ``down`` is True (omit
+            them to use the default autostop settings).
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         sky.exceptions.NotSupportedError: if the cluster to restart was
             launched using a non-default backend that does not support this
             operation.
@@ -412,7 +414,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
         related resources.

     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         RuntimeError: failed to stop the cluster.
         sky.exceptions.NotSupportedError: if the specified cluster is a spot
             cluster, or a TPU VM Pod cluster, or the managed jobs controller.
@@ -423,7 +426,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
                 f'is not supported.')
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     if handle is None:
-        raise
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')

     backend = backend_utils.get_backend_from_handle(handle)

@@ -467,14 +471,16 @@ def down(cluster_name: str, purge: bool = False) -> None:
         resources.

     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         RuntimeError: failed to tear down the cluster.
         sky.exceptions.NotSupportedError: the specified cluster is the managed
             jobs controller.
     """
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     if handle is None:
-        raise
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')

     usage_lib.record_cluster_name_for_current_operation(cluster_name)
     backend = backend_utils.get_backend_from_handle(handle)
@@ -521,7 +527,7 @@ def autostop(
         rather than autostop (restartable).

     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend or the cluster is TPU VM Pod.
@@ -615,7 +621,7 @@ def queue(cluster_name: str,
            }
        ]
    raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
        sky.exceptions.ClusterNotUpError: if the cluster is not UP.
        sky.exceptions.NotSupportedError: if the cluster is not based on
            CloudVmRayBackend.
@@ -674,7 +680,8 @@ def cancel(
         worker node is preempted in the spot cluster.

     Raises:
-        ValueError: if arguments are invalid
+        ValueError: if arguments are invalid.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the specified cluster is a
             controller that does not support this operation.
@@ -750,8 +757,8 @@ def tail_logs(cluster_name: str,
         Please refer to the sky.cli.tail_logs for the document.

     Raises:
-        ValueError: arguments are invalid or the cluster is not supported
-
+        ValueError: if arguments are invalid or the cluster is not supported.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -793,7 +800,7 @@ def download_logs(
     Returns:
         Dict[str, str]: a mapping of job_id to local log path.
     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -838,7 +845,7 @@ def job_status(cluster_name: str,
         If job_ids is None and there is no job on the cluster, it will return
         {None: None}.
     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
sky/exceptions.py
CHANGED
@@ -132,6 +132,13 @@ class ClusterSetUpError(Exception):
     pass


+class ClusterDoesNotExist(ValueError):
+    """Raise when trying to operate on a cluster that does not exist."""
+    # This extends ValueError for compatibility reasons - we used to throw
+    # ValueError instead of this.
+    pass
+
+
 class NotSupportedError(Exception):
     """Raised when a feature is not supported."""
     pass
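Because ClusterDoesNotExist subclasses ValueError, code written against older nightlies (which raised a plain ValueError for this condition) keeps working unchanged, while new code can catch the more specific type. A minimal caller-side sketch, assuming a hypothetical cluster name 'my-cluster':

    import sky
    from sky import exceptions

    try:
        sky.down('my-cluster')
    except exceptions.ClusterDoesNotExist:
        # Specific exception raised by 1.0.0.dev20241205 and later.
        print('Cluster my-cluster does not exist; nothing to tear down.')
    except ValueError as e:
        # Older nightlies raised a bare ValueError for the same condition.
        print(f'Invalid request: {e}')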
sky/execution.py
CHANGED
@@ -108,6 +108,7 @@ def _execute(
     idle_minutes_to_autostop: Optional[int] = None,
     no_setup: bool = False,
     clone_disk_from: Optional[str] = None,
+    skip_unnecessary_provisioning: bool = False,
     # Internal only:
     # pylint: disable=invalid-name
     _is_launched_by_jobs_controller: bool = False,
@@ -128,8 +129,9 @@ def _execute(
       Note that if errors occur during provisioning/data syncing/setting up,
       the cluster will not be torn down for debugging purposes.
     stream_logs: bool; whether to stream all tasks' outputs to the client.
-    handle: Optional[backends.ResourceHandle]; if provided, execution will
-      an existing backend cluster handle instead of
+    handle: Optional[backends.ResourceHandle]; if provided, execution will
+      attempt to use an existing backend cluster handle instead of
+      provisioning a new one.
     backend: Backend; backend to use for executing the tasks. Defaults to
       CloudVmRayBackend()
     retry_until_up: bool; whether to retry the provisioning until the cluster
@@ -150,6 +152,11 @@ def _execute(
     idle_minutes_to_autostop: int; if provided, the cluster will be set to
       autostop after this many minutes of idleness.
     no_setup: bool; whether to skip setup commands or not when (re-)launching.
+    clone_disk_from: Optional[str]; if set, clone the disk from the specified
+      cluster.
+    skip_unecessary_provisioning: bool; if True, compare the calculated
+      cluster config to the current cluster's config. If they match, shortcut
+      provisioning even if we have Stage.PROVISION.

   Returns:
     job_id: Optional[int]; the job ID of the submitted job. None if the
@@ -288,13 +295,18 @@ def _execute(

     try:
         if Stage.PROVISION in stages:
-
-            handle
-
-
-
-
-
+            assert handle is None or skip_unnecessary_provisioning, (
+                'Provisioning requested, but handle is already set. PROVISION '
+                'should be excluded from stages or '
+                'skip_unecessary_provisioning should be set. ')
+            handle = backend.provision(
+                task,
+                task.best_resources,
+                dryrun=dryrun,
+                stream_logs=stream_logs,
+                cluster_name=cluster_name,
+                retry_until_up=retry_until_up,
+                skip_unnecessary_provisioning=skip_unnecessary_provisioning)

         if handle is None:
             assert dryrun, ('If not dryrun, handle must be set or '
@@ -469,6 +481,7 @@ def launch(

     handle = None
     stages = None
+    skip_unnecessary_provisioning = False
     # Check if cluster exists and we are doing fast provisioning
     if fast and cluster_name is not None:
         cluster_status, maybe_handle = (
@@ -502,12 +515,16 @@ def launch(
         if cluster_status == status_lib.ClusterStatus.UP:
             handle = maybe_handle
             stages = [
+                # Provisioning will be short-circuited if the existing
+                # cluster config hash matches the calculated one.
+                Stage.PROVISION,
                 Stage.SYNC_WORKDIR,
                 Stage.SYNC_FILE_MOUNTS,
                 Stage.PRE_EXEC,
                 Stage.EXEC,
                 Stage.DOWN,
             ]
+            skip_unnecessary_provisioning = True

     return _execute(
         entrypoint=entrypoint,
@@ -525,6 +542,7 @@ def launch(
         idle_minutes_to_autostop=idle_minutes_to_autostop,
         no_setup=no_setup,
         clone_disk_from=clone_disk_from,
+        skip_unnecessary_provisioning=skip_unnecessary_provisioning,
         _is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
         _is_launched_by_sky_serve_controller=
             _is_launched_by_sky_serve_controller,
@@ -581,8 +599,9 @@ def exec(  # pylint: disable=redefined-builtin
         submitted.

     Raises:
-        ValueError: if the specified cluster
-
+        ValueError: if the specified cluster is not in UP status.
+        sky.exceptions.ClusterDoesNotExist: if the specified cluster does not
+            exist.
         sky.exceptions.NotSupportedError: if the specified cluster is a
             controller that does not support this operation.
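The user-visible effect of skip_unnecessary_provisioning is on the fast-launch path: with fast=True and an UP cluster, Stage.PROVISION now stays in the stage list but is short-circuited when the stored cluster config hash still matches the newly calculated one. A rough usage sketch, assuming a hypothetical task and cluster name:

    import sky

    task = sky.Task(run='echo hello')
    task.set_resources(sky.Resources(cpus='2+'))

    # The first call provisions the cluster; later calls with fast=True reuse
    # it and skip provisioning when the calculated config hash is unchanged.
    sky.launch(task, cluster_name='my-cluster', fast=True)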
sky/global_user_state.py
CHANGED
@@ -61,7 +61,8 @@ def create_table(cursor, conn):
         cluster_hash TEXT DEFAULT null,
         storage_mounts_metadata BLOB DEFAULT null,
         cluster_ever_up INTEGER DEFAULT 0,
-        status_updated_at INTEGER DEFAULT null
+        status_updated_at INTEGER DEFAULT null,
+        config_hash TEXT DEFAULT null)""")

     # Table for Cluster History
     # usage_intervals: List[Tuple[int, int]]
@@ -135,6 +136,9 @@ def create_table(cursor, conn):
     db_utils.add_column_to_table(cursor, conn, 'clusters', 'status_updated_at',
                                  'INTEGER DEFAULT null')

+    db_utils.add_column_to_table(cursor, conn, 'clusters', 'config_hash',
+                                 'TEXT DEFAULT null')
+
     conn.commit()


@@ -145,7 +149,8 @@ def add_or_update_cluster(cluster_name: str,
                           cluster_handle: 'backends.ResourceHandle',
                           requested_resources: Optional[Set[Any]],
                           ready: bool,
-                          is_launch: bool = True
+                          is_launch: bool = True,
+                          config_hash: Optional[str] = None):
     """Adds or updates cluster_name -> cluster_handle mapping.

     Args:
@@ -197,7 +202,8 @@ def add_or_update_cluster(cluster_name: str,
         # specified.
         '(name, launched_at, handle, last_use, status, '
         'autostop, to_down, metadata, owner, cluster_hash, '
-        'storage_mounts_metadata, cluster_ever_up, status_updated_at
+        'storage_mounts_metadata, cluster_ever_up, status_updated_at, '
+        'config_hash) '
         'VALUES ('
         # name
         '?, '
@@ -236,7 +242,9 @@ def add_or_update_cluster(cluster_name: str,
         # cluster_ever_up
         '((SELECT cluster_ever_up FROM clusters WHERE name=?) OR ?),'
         # status_updated_at
-        '
+        '?,'
+        # config_hash
+        'COALESCE(?, (SELECT config_hash FROM clusters WHERE name=?))'
         ')',
         (
             # name
@@ -270,6 +278,9 @@ def add_or_update_cluster(cluster_name: str,
             int(ready),
             # status_updated_at
             status_updated_at,
+            # config_hash
+            config_hash,
+            cluster_name,
         ))

     launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
@@ -585,15 +596,15 @@ def get_cluster_from_name(
     rows = _DB.cursor.execute(
         'SELECT name, launched_at, handle, last_use, status, autostop, '
         'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
-        'cluster_ever_up, status_updated_at
-        (cluster_name,)).fetchall()
+        'cluster_ever_up, status_updated_at, config_hash '
+        'FROM clusters WHERE name=(?)', (cluster_name,)).fetchall()
     for row in rows:
         # Explicitly specify the number of fields to unpack, so that
         # we can add new fields to the database in the future without
         # breaking the previous code.
         (name, launched_at, handle, last_use, status, autostop, metadata,
          to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
-         status_updated_at) = row[:
+         status_updated_at, config_hash) = row[:14]
         # TODO: use namedtuple instead of dict
         record = {
             'name': name,
@@ -610,6 +621,7 @@ def get_cluster_from_name(
                 _load_storage_mounts_metadata(storage_mounts_metadata),
             'cluster_ever_up': bool(cluster_ever_up),
             'status_updated_at': status_updated_at,
+            'config_hash': config_hash,
         }
         return record
     return None
@@ -619,13 +631,13 @@ def get_clusters() -> List[Dict[str, Any]]:
     rows = _DB.cursor.execute(
         'select name, launched_at, handle, last_use, status, autostop, '
         'metadata, to_down, owner, cluster_hash, storage_mounts_metadata, '
-        'cluster_ever_up, status_updated_at
-        'order by launched_at desc').fetchall()
+        'cluster_ever_up, status_updated_at, config_hash '
+        'from clusters order by launched_at desc').fetchall()
     records = []
     for row in rows:
         (name, launched_at, handle, last_use, status, autostop, metadata,
          to_down, owner, cluster_hash, storage_mounts_metadata, cluster_ever_up,
-         status_updated_at) = row[:
+         status_updated_at, config_hash) = row[:14]
         # TODO: use namedtuple instead of dict
         record = {
             'name': name,
@@ -642,6 +654,7 @@ def get_clusters() -> List[Dict[str, Any]]:
                 _load_storage_mounts_metadata(storage_mounts_metadata),
             'cluster_ever_up': bool(cluster_ever_up),
             'status_updated_at': status_updated_at,
+            'config_hash': config_hash,
         }

         records.append(record)
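The COALESCE in the upsert above preserves a previously stored config_hash whenever the caller passes None (e.g. non-launch status updates) and overwrites it only when a new hash is supplied. A self-contained sketch of that pattern on a simplified two-column table (not the real clusters schema):

    import sqlite3

    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE clusters (name TEXT PRIMARY KEY, config_hash TEXT)')
    conn.execute('INSERT INTO clusters VALUES (?, ?)', ('c1', 'hash-v1'))

    def upsert(name, config_hash):
        # COALESCE keeps the previously stored hash when the caller passes
        # None, mirroring the INSERT OR REPLACE in add_or_update_cluster().
        conn.execute(
            'INSERT OR REPLACE INTO clusters (name, config_hash) VALUES '
            '(?, COALESCE(?, (SELECT config_hash FROM clusters WHERE name=?)))',
            (name, config_hash, name))

    upsert('c1', None)       # non-launch update: existing hash preserved
    print(conn.execute('SELECT config_hash FROM clusters WHERE name=?',
                       ('c1',)).fetchone())  # ('hash-v1',)
    upsert('c1', 'hash-v2')  # launch with a newly computed hash: overwritten
    print(conn.execute('SELECT config_hash FROM clusters WHERE name=?',
                       ('c1',)).fetchone())  # ('hash-v2',)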
sky/jobs/controller.py
CHANGED
@@ -6,7 +6,7 @@ import pathlib
 import time
 import traceback
 import typing
-from typing import Tuple
+from typing import Optional, Tuple

 import filelock

@@ -87,18 +87,28 @@ class JobsController:
         task.update_envs(task_envs)

     def _download_log_and_stream(
-
-
-
+        self, task_id: Optional[int],
+        handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle]
+    ) -> None:
+        """Downloads and streams the logs of the current job with given task ID.

         We do not stream the logs from the cluster directly, as the
         donwload and stream should be faster, and more robust against
         preemptions or ssh disconnection during the streaming.
         """
+        if handle is None:
+            logger.info(f'Cluster for job {self._job_id} is not found. '
+                        'Skipping downloading and streaming the logs.')
+            return
         managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
                                             'managed_jobs')
-        controller_utils.download_and_stream_latest_job_log(
+        log_file = controller_utils.download_and_stream_latest_job_log(
             self._backend, handle, managed_job_logs_dir)
+        if log_file is not None:
+            # Set the path of the log file for the current task, so it can be
+            # accessed even after the job is finished
+            managed_job_state.set_local_log_file(self._job_id, task_id,
+                                                 log_file)
         logger.info(f'\n== End of logs (ID: {self._job_id}) ==')

     def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
@@ -213,7 +223,8 @@ class JobsController:
             if job_status == job_lib.JobStatus.SUCCEEDED:
                 end_time = managed_job_utils.get_job_timestamp(
                     self._backend, cluster_name, get_end_time=True)
-                # The job is done.
+                # The job is done. Set the job to SUCCEEDED first before start
+                # downloading and streaming the logs to make it more responsive.
                 managed_job_state.set_succeeded(self._job_id,
                                                 task_id,
                                                 end_time=end_time,
@@ -221,12 +232,21 @@ class JobsController:
                 logger.info(
                     f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
                     f'Cleaning up the cluster {cluster_name}.')
+                clusters = backend_utils.get_clusters(
+                    cluster_names=[cluster_name],
+                    refresh=False,
+                    include_controller=False)
+                if clusters:
+                    assert len(clusters) == 1, (clusters, cluster_name)
+                    handle = clusters[0].get('handle')
+                    # Best effort to download and stream the logs.
+                    self._download_log_and_stream(task_id, handle)
                 # Only clean up the cluster, not the storages, because tasks may
                 # share storages.
                 recovery_strategy.terminate_cluster(cluster_name=cluster_name)
                 return True

-            # For single-node jobs,
+            # For single-node jobs, non-terminated job_status indicates a
             # healthy cluster. We can safely continue monitoring.
             # For multi-node jobs, since the job may not be set to FAILED
             # immediately (depending on user program) when only some of the
@@ -278,7 +298,7 @@ class JobsController:
                     'The user job failed. Please check the logs below.\n'
                     f'== Logs of the user job (ID: {self._job_id}) ==\n')

-                self._download_log_and_stream(handle)
+                self._download_log_and_stream(task_id, handle)
                 managed_job_status = (
                     managed_job_state.ManagedJobStatus.FAILED)
                 if job_status == job_lib.JobStatus.FAILED_SETUP:
sky/jobs/core.py
CHANGED
@@ -1,6 +1,7 @@
 """SDK functions for managed jobs."""
 import os
 import tempfile
+import typing
 from typing import Any, Dict, List, Optional, Union
 import uuid

@@ -29,6 +30,9 @@ from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils

+if typing.TYPE_CHECKING:
+    from sky.backends import cloud_vm_ray_backend
+

 @timeline.event
 @usage_lib.entrypoint
@@ -225,6 +229,40 @@ def queue_from_kubernetes_pod(
     return jobs


+def _maybe_restart_controller(
+        refresh: bool, stopped_message: str, spinner_message: str
+) -> 'cloud_vm_ray_backend.CloudVmRayResourceHandle':
+    """Restart controller if refresh is True and it is stopped."""
+    jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
+    if refresh:
+        stopped_message = ''
+    try:
+        handle = backend_utils.is_controller_accessible(
+            controller=jobs_controller_type, stopped_message=stopped_message)
+    except exceptions.ClusterNotUpError as e:
+        if not refresh:
+            raise
+        handle = None
+        controller_status = e.cluster_status
+
+    if handle is not None:
+        return handle
+
+    sky_logging.print(f'{colorama.Fore.YELLOW}'
+                      f'Restarting {jobs_controller_type.value.name}...'
+                      f'{colorama.Style.RESET_ALL}')
+
+    rich_utils.force_update_status(
+        ux_utils.spinner_message(f'{spinner_message} - restarting '
+                                 'controller'))
+    handle = sky.start(jobs_controller_type.value.cluster_name)
+    controller_status = status_lib.ClusterStatus.UP
+    rich_utils.force_update_status(ux_utils.spinner_message(spinner_message))
+
+    assert handle is not None, (controller_status, refresh)
+    return handle
+
+
 @usage_lib.entrypoint
 def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -252,34 +290,11 @@ def queue(refresh: bool, skip_finished: bool = False) -> List[Dict[str, Any]]:
             does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
-
-
-
-
-
-        handle = backend_utils.is_controller_accessible(
-            controller=jobs_controller_type, stopped_message=stopped_message)
-    except exceptions.ClusterNotUpError as e:
-        if not refresh:
-            raise
-        handle = None
-        controller_status = e.cluster_status
-
-    if refresh and handle is None:
-        sky_logging.print(f'{colorama.Fore.YELLOW}'
-                          'Restarting controller for latest status...'
-                          f'{colorama.Style.RESET_ALL}')
-
-        rich_utils.force_update_status(
-            ux_utils.spinner_message('Checking managed jobs - restarting '
-                                     'controller'))
-        handle = sky.start(jobs_controller_type.value.cluster_name)
-        controller_status = status_lib.ClusterStatus.UP
-        rich_utils.force_update_status(
-            ux_utils.spinner_message('Checking managed jobs'))
-
-        assert handle is not None, (controller_status, refresh)
-
+    handle = _maybe_restart_controller(refresh,
+                                       stopped_message='No in-progress '
+                                       'managed jobs.',
+                                       spinner_message='Checking '
+                                       'managed jobs')
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)

@@ -371,7 +386,7 @@ def cancel(name: Optional[str] = None,

 @usage_lib.entrypoint
 def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
-              controller: bool) -> None:
+              controller: bool, refresh: bool) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Tail logs of managed jobs.

@@ -382,15 +397,26 @@ def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
         sky.exceptions.ClusterNotUpError: the jobs controller is not up.
     """
     # TODO(zhwu): Automatically restart the jobs controller
+    if name is not None and job_id is not None:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Cannot specify both name and job_id.')
+
     jobs_controller_type = controller_utils.Controllers.JOBS_CONTROLLER
-
-
+    job_name_or_id_str = ''
+    if job_id is not None:
+        job_name_or_id_str = str(job_id)
+    elif name is not None:
+        job_name_or_id_str = f'-n {name}'
+    else:
+        job_name_or_id_str = ''
+    handle = _maybe_restart_controller(
+        refresh,
         stopped_message=(
-            '
-            f'
+            f'{jobs_controller_type.value.name.capitalize()} is stopped. To '
+            f'get the logs, run: {colorama.Style.BRIGHT}sky jobs logs '
+            f'-r {job_name_or_id_str}{colorama.Style.RESET_ALL}'),
+        spinner_message='Retrieving job logs')

-    if name is not None and job_id is not None:
-        raise ValueError('Cannot specify both name and job_id.')
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend), backend

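On the caller side, the new refresh flag lets log retrieval transparently restart a stopped jobs controller instead of failing with ClusterNotUpError; the stopped-controller hint above points at `sky jobs logs -r`. A minimal sketch of the Python entrypoint, assuming a managed job with ID 1 exists:

    from sky import jobs as managed_jobs

    # refresh=False keeps the old behavior (raise ClusterNotUpError when the
    # controller is stopped); refresh=True restarts the controller first and
    # then tails the logs.
    managed_jobs.tail_logs(name=None, job_id=1, follow=True, controller=False,
                           refresh=True)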
sky/jobs/recovery_strategy.py
CHANGED
@@ -50,8 +50,9 @@ def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
             usage_lib.messages.usage.set_internal()
             sky.down(cluster_name)
             return
-        except
+        except exceptions.ClusterDoesNotExist:
             # The cluster is already down.
+            logger.debug(f'The cluster {cluster_name} is already down.')
             return
         except Exception as e:  # pylint: disable=broad-except
             retry_cnt += 1
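Narrowing the except clause keeps teardown idempotent: a second attempt for a cluster that is already gone is treated as success instead of being retried as an error. A standalone sketch of the same pattern, with a hypothetical cluster name and retry policy:

    import time

    import sky
    from sky import exceptions

    def terminate_idempotently(cluster_name: str, max_retry: int = 3) -> None:
        for attempt in range(max_retry):
            try:
                sky.down(cluster_name)
                return
            except exceptions.ClusterDoesNotExist:
                # Already gone - nothing left to do.
                return
            except Exception:  # pylint: disable=broad-except
                # Transient failure (e.g. cloud API hiccup): back off and retry.
                time.sleep(2 ** attempt)
        raise RuntimeError(f'Failed to terminate cluster {cluster_name!r}.')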
sky/jobs/state.py
CHANGED
@@ -66,7 +66,8 @@ def create_table(cursor, conn):
         spot_job_id INTEGER,
         task_id INTEGER DEFAULT 0,
         task_name TEXT,
-        specs TEXT
+        specs TEXT,
+        local_log_file TEXT DEFAULT NULL)""")
     conn.commit()

     db_utils.add_column_to_table(cursor, conn, 'spot', 'failure_reason', 'TEXT')
@@ -103,6 +104,8 @@ def create_table(cursor, conn):
         value_to_replace_existing_entries=json.dumps({
             'max_restarts_on_errors': 0,
         }))
+    db_utils.add_column_to_table(cursor, conn, 'spot', 'local_log_file',
+                                 'TEXT DEFAULT NULL')

     # `job_info` contains the mapping from job_id to the job_name.
     # In the future, it may contain more information about each job.
@@ -157,6 +160,7 @@ columns = [
     'task_id',
     'task_name',
     'specs',
+    'local_log_file',
     # columns from the job_info table
     '_job_info_job_id',  # This should be the same as job_id
     'job_name',
@@ -512,6 +516,20 @@ def set_cancelled(job_id: int, callback_func: CallbackType):
         callback_func('CANCELLED')


+def set_local_log_file(job_id: int, task_id: Optional[int],
+                       local_log_file: str):
+    """Set the local log file for a job."""
+    filter_str = 'spot_job_id=(?)'
+    filter_args = [local_log_file, job_id]
+    if task_id is not None:
+        filter_str += ' AND task_id=(?)'
+        filter_args.append(task_id)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute(
+            'UPDATE spot SET local_log_file=(?) '
+            f'WHERE {filter_str}', filter_args)
+
+
 # ======== utility functions ========
 def get_nonterminal_job_ids_by_name(name: Optional[str]) -> List[int]:
     """Get non-terminal job ids by name."""
@@ -662,3 +680,17 @@ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
           WHERE spot_job_id=(?) AND task_id=(?)""",
         (job_id, task_id)).fetchone()
     return json.loads(task_specs[0])
+
+
+def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
+    """Get the local log directory for a job."""
+    filter_str = 'spot_job_id=(?)'
+    filter_args = [job_id]
+    if task_id is not None:
+        filter_str += ' AND task_id=(?)'
+        filter_args.append(task_id)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        local_log_file = cursor.execute(
+            f'SELECT local_log_file FROM spot '
+            f'WHERE {filter_str}', filter_args).fetchone()
+    return local_log_file[-1] if local_log_file else None