skypilot-nightly 1.0.0.dev20250114__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +50 -67
- sky/check.py +31 -1
- sky/cli.py +11 -34
- sky/clouds/kubernetes.py +3 -3
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/core.py +8 -5
- sky/data/storage.py +66 -14
- sky/global_user_state.py +1 -1
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +0 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +263 -21
- sky/jobs/utils.py +338 -96
- sky/provision/aws/config.py +48 -26
- sky/provision/gcp/instance_utils.py +15 -9
- sky/provision/kubernetes/instance.py +1 -1
- sky/provision/kubernetes/utils.py +76 -18
- sky/resources.py +1 -1
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +84 -7
- sky/serve/load_balancer.py +27 -10
- sky/serve/replica_managers.py +1 -3
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/skylet.py +1 -1
- sky/templates/jobs-controller.yaml.j2 +7 -3
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/resources_utils.py +25 -21
- sky/utils/schemas.py +13 -0
- sky/utils/subprocess_utils.py +48 -9
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +4 -1
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +49 -48
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py
CHANGED
@@ -13,23 +13,28 @@ import shlex
 import shutil
 import textwrap
 import time
+import traceback
 import typing
 from typing import Any, Dict, List, Optional, Set, Tuple, Union
 
 import colorama
 import filelock
+import psutil
 from typing_extensions import Literal
 
+import sky
 from sky import backends
 from sky import exceptions
 from sky import global_user_state
 from sky import sky_logging
 from sky.backends import backend_utils
 from sky.jobs import constants as managed_job_constants
+from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
+from sky.usage import usage_lib
 from sky.utils import common_utils
 from sky.utils import log_utils
 from sky.utils import rich_utils
@@ -37,7 +42,6 @@ from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
-    import sky
     from sky import dag as dag_lib
 
 logger = sky_logging.init_logger(__name__)
@@ -69,8 +73,10 @@ _JOB_CANCELLED_MESSAGE = (
 # The maximum time to wait for the managed job status to transition to terminal
 # state, after the job finished. This is a safeguard to avoid the case where
 # the managed job status fails to be updated and keep the `sky jobs logs`
-# blocking for a long time.
-[... removed line not rendered in this diff view ...]
+# blocking for a long time. This should be significantly longer than the
+# JOB_STATUS_CHECK_GAP_SECONDS to avoid timing out before the controller can
+# update the state.
+_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 40
 
 
 class UserSignal(enum.Enum):
@@ -81,6 +87,43 @@ class UserSignal(enum.Enum):
 
 
 # ====== internal functions ======
+def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
+    """Terminate the cluster."""
+    retry_cnt = 0
+    # In some cases, e.g. botocore.exceptions.NoCredentialsError due to AWS
+    # metadata service throttling, the failed sky.down attempt can take 10-11
+    # seconds. In this case, we need the backoff to significantly reduce the
+    # rate of requests - that is, significantly increase the time between
+    # requests. We set the initial backoff to 15 seconds, so that once it grows
+    # exponentially it will quickly dominate the 10-11 seconds that we already
+    # see between requests. We set the max backoff very high, since it's
+    # generally much more important to eventually succeed than to fail fast.
+    backoff = common_utils.Backoff(
+        initial_backoff=15,
+        # 1.6 ** 5 = 10.48576 < 20, so we won't hit this with default max_retry
+        max_backoff_factor=20)
+    while True:
+        try:
+            usage_lib.messages.usage.set_internal()
+            sky.down(cluster_name)
+            return
+        except exceptions.ClusterDoesNotExist:
+            # The cluster is already down.
+            logger.debug(f'The cluster {cluster_name} is already down.')
+            return
+        except Exception as e:  # pylint: disable=broad-except
+            retry_cnt += 1
+            if retry_cnt >= max_retry:
+                raise RuntimeError(
+                    f'Failed to terminate the cluster {cluster_name}.') from e
+            logger.error(
+                f'Failed to terminate the cluster {cluster_name}. Retrying.'
+                f'Details: {common_utils.format_exception(e)}')
+            with ux_utils.enable_traceback():
+                logger.error(f'  Traceback: {traceback.format_exc()}')
+            time.sleep(backoff.current_backoff())
+
+
 def get_job_status(backend: 'backends.CloudVmRayBackend',
                    cluster_name: str) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.
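The `terminate_cluster` helper added above retries `sky.down` with exponential backoff, starting at 15 seconds so that the backoff quickly dominates the 10-11 second failure latency seen under AWS metadata-service throttling. SkyPilot's `common_utils.Backoff` implementation is not part of this diff; the following is a rough standalone sketch of the same idea, assuming a 1.6x growth factor (taken from the in-code comment) and a hard cap, not the actual SkyPilot class:

    import random


    class IllustrativeBackoff:
        """Exponential backoff with a cap (illustration only, not SkyPilot's)."""

        def __init__(self, initial_backoff: float = 15.0,
                     max_backoff_factor: int = 20, multiplier: float = 1.6):
            self._initial = initial_backoff
            self._cap = initial_backoff * max_backoff_factor
            self._multiplier = multiplier
            self._current = None

        def current_backoff(self) -> float:
            # First call waits the initial amount; each later call multiplies
            # the wait, never exceeding the cap. Jitter avoids synchronized
            # retries from multiple callers.
            if self._current is None:
                self._current = self._initial
            else:
                self._current = min(self._current * self._multiplier, self._cap)
            return self._current * random.uniform(0.9, 1.1)


    # With 6 attempts the waits are roughly 15, 24, 38, 61, 98, 157 seconds
    # (before jitter), well under the 15 * 20 = 300 second cap, matching the
    # '1.6 ** 5 = 10.48576 < 20' comment in the diff.
    if __name__ == '__main__':
        backoff = IllustrativeBackoff()
        print([round(backoff.current_backoff(), 1) for _ in range(6)])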
@@ -105,57 +148,145 @@ def get_job_status(backend: 'backends.CloudVmRayBackend',
     return status
 
 
+def _controller_process_alive(pid: int, job_id: int) -> bool:
+    """Check if the controller process is alive."""
+    try:
+        process = psutil.Process(pid)
+        # The last two args of the command line should be --job-id <id>
+        job_args = process.cmdline()[-2:]
+        return process.is_running() and job_args == ['--job-id', str(job_id)]
+    except psutil.NoSuchProcess:
+        return False
+
+
 def update_managed_job_status(job_id: Optional[int] = None):
-    """Update managed job status if the controller
+    """Update managed job status if the controller process failed abnormally.
+
+    Check the status of the controller process. If it is not running, it must
+    have exited abnormally, and we should set the job status to
+    FAILED_CONTROLLER. `end_at` will be set to the current timestamp for the job
+    when above happens, which could be not accurate based on the frequency this
+    function is called.
 
-    [... removed line not rendered in this diff view ...]
-    exited abnormally, and we should set the job status to FAILED_CONTROLLER.
-    `end_at` will be set to the current timestamp for the job when above
-    happens, which could be not accurate based on the frequency this function
-    is called.
+    Note: we expect that job_id, if provided, refers to a nonterminal job.
     """
+
     if job_id is None:
+        # Warning: it's totally possible for the managed job to transition to
+        # a terminal status during the course of this function. The set_failed()
+        # called below will not update the state for jobs that already have a
+        # terminal status, so it should be fine.
        job_ids = managed_job_state.get_nonterminal_job_ids_by_name(None)
     else:
         job_ids = [job_id]
     for job_id_ in job_ids:
-        [... old lines 122-158 (previous implementation) not rendered in this diff view ...]
+
+        failure_reason = None
+
+        tasks = managed_job_state.get_managed_jobs(job_id_)
+        schedule_state = tasks[0]['schedule_state']
+        if schedule_state is None:
+            # Backwards compatibility: this job was submitted when ray was still
+            # used for managing the parallelism of job controllers.
+            # TODO(cooperc): Remove before 0.11.0.
+            controller_status = job_lib.get_status(job_id_)
+            if controller_status is None or controller_status.is_terminal():
+                logger.error(f'Controller process for legacy job {job_id_} is '
+                             'in an unexpected state.')
+                failure_reason = 'Legacy job is in an unexpected state'
+
+                # Continue to mark the job as failed.
+            else:
+                # Still running.
+                continue
+        else:
+            pid = tasks[0]['controller_pid']
+            if pid is None:
+                if schedule_state in (
+                        managed_job_state.ManagedJobScheduleState.INACTIVE,
+                        managed_job_state.ManagedJobScheduleState.WAITING):
+                    # Job has not been scheduled yet.
+                    continue
+                elif (schedule_state ==
+                      managed_job_state.ManagedJobScheduleState.LAUNCHING):
+                    # This should only be the case for a very short period of
+                    # time between marking the job as submitted and writing the
+                    # launched controller process pid back to the database (see
+                    # scheduler.maybe_schedule_next_jobs).
+                    # TODO(cooperc): Find a way to detect if we get stuck in
+                    # this state.
+                    logger.info(f'Job {job_id_} is in LAUNCHING state, '
+                                'but controller process hasn\'t started yet.')
+                    continue
+                # All other statuses are unexpected. Proceed to mark as failed.
+                logger.error(f'Expected to find a controller pid for state '
+                             f'{schedule_state.value} but found none.')
+                failure_reason = ('No controller pid set for '
+                                  f'{schedule_state.value}')
+            else:
+                logger.debug(f'Checking controller pid {pid}')
+                if _controller_process_alive(pid, job_id_):
+                    # The controller is still running.
+                    continue
+                # Otherwise, proceed to mark the job as failed.
+                logger.error(f'Controller process for {job_id_} seems to be '
+                             'dead.')
+                failure_reason = 'Controller process is dead'
+
+        logger.error(f'Controller process for job {job_id_} has exited '
+                     'abnormally. Setting the job status to FAILED_CONTROLLER.')
+        for task in tasks:
+            task_name = task['job_name']
+            # Tear down the abnormal cluster to avoid resource leakage.
+            cluster_name = generate_managed_job_cluster_name(task_name, job_id_)
+            handle = global_user_state.get_handle_from_cluster_name(
+                cluster_name)
+            # If the cluster exists, terminate it.
+            if handle is not None:
+                terminate_cluster(cluster_name)
+
+        # The controller process for this managed job is not running: it must
+        # have exited abnormally, and we should set the job status to
+        # FAILED_CONTROLLER.
+        # The `set_failed` will only update the task's status if the
+        # status is non-terminal.
+        managed_job_state.set_failed(
+            job_id_,
+            task_id=None,
+            failure_type=managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
+            failure_reason=
+            f'Controller process has exited abnormally ({failure_reason}). For '
+            f'more details, run: sky jobs logs --controller {job_id_}')
+        scheduler.job_done(job_id_, idempotent=True)
+
+    # Some jobs may be in a terminal status, but are not yet DONE. For instance,
+    # they may be still cleaning up resources, etc. Such jobs won't be captured
+    # by the above check, which only looks at nonterminal jobs. So, check the
+    # controller liveness of all jobs that should have live controller
+    # processes.
+    for job_info in managed_job_state.get_schedule_live_jobs(job_id):
+        if not job_info['controller_pid']:
+            # Technically, a job with no controller process but in LAUNCHING
+            # schedule state can happen very briefly after the job is set to
+            # LAUNCHING but before the controller process is actually spawned.
+            # However, if we observe any state other than LAUNCHING, something
+            # is clearly wrong.
+            if (job_info['schedule_state'] !=
+                    managed_job_state.ManagedJobScheduleState.LAUNCHING):
+                logger.error(
+                    f'Missing controller PID for {job_info["job_id"]}. '
+                    'Setting to DONE.')
+                scheduler.job_done(job_info['job_id'])
+            else:
+                logger.info(f'LAUNCHING job {job_info["job_id"]} has no '
+                            'controller process yet. Skipping.')
+
+        elif not _controller_process_alive(job_info['controller_pid'],
+                                           job_info['job_id']):
+            logger.error(
+                f'Controller process for job {job_info["job_id"]} is not '
+                'alive. Marking the job as DONE.')
+            scheduler.job_done(job_info['job_id'])
 
 
 def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
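`_controller_process_alive` checks not just that a process with the recorded PID exists, but that its command line still ends with `--job-id <id>`, which guards against the OS recycling the PID for an unrelated process after the controller dies. A minimal sketch of the same psutil pattern, with an illustrative function name rather than anything from the SkyPilot codebase:

    from typing import List

    import psutil


    def pid_runs_expected_command(pid: int, expected_tail: List[str]) -> bool:
        # True only if a process with this PID exists, is running, and its
        # command line ends with the expected arguments. The command-line check
        # matters because PIDs are reused once a process exits.
        try:
            proc = psutil.Process(pid)
            tail = proc.cmdline()[-len(expected_tail):]
            return proc.is_running() and tail == expected_tail
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            return False


    # Example: pid_runs_expected_command(12345, ['--job-id', '42'])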
@@ -398,32 +529,15 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
             job_statuses = backend.get_job_status(handle, stream_logs=False)
             job_status = list(job_statuses.values())[0]
             assert job_status is not None, 'No job found.'
+            assert task_id is not None, job_id
+
             if job_status != job_lib.JobStatus.CANCELLED:
-                [... 6 removed lines not rendered in this diff view ...]
-                        'is finished. Waiting for the next task\'s log '
-                        'to be started.')
-                    # Add a newline to avoid the status display below
-                    # removing the last line of the task output.
-                    print()
-                    status_display.update(
-                        ux_utils.spinner_message(
-                            f'Waiting for the next task: {task_id + 1}'))
-                    status_display.start()
-                    original_task_id = task_id
-                    while True:
-                        task_id, managed_job_status = (
-                            managed_job_state.get_latest_task_id_status(
-                                job_id))
-                        if original_task_id != task_id:
-                            break
-                        time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
-                    continue
-                else:
+                if not follow:
+                    break
+
+                # Logs for retrying failed tasks.
+                if (job_status
+                        in job_lib.JobStatus.user_code_failure_states()):
                     task_specs = managed_job_state.get_task_specs(
                         job_id, task_id)
                     if task_specs.get('max_restarts_on_errors', 0) == 0:
@@ -436,15 +550,51 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                         ux_utils.spinner_message(
                             'Waiting for next restart for the failed task'))
                     status_display.start()
-                    [... 7 removed lines not rendered in this diff view ...]
+
+                    def is_managed_job_status_updated(
+                        status: Optional[managed_job_state.ManagedJobStatus]
+                    ) -> bool:
+                        """Check if local managed job status reflects remote
+                        job failure.
+
+                        Ensures synchronization between remote cluster
+                        failure detection (JobStatus.FAILED) and controller
+                        retry logic.
+                        """
+                        return (status !=
+                                managed_job_state.ManagedJobStatus.RUNNING)
+
+                    while not is_managed_job_status_updated(
+                            managed_job_status :=
+                            managed_job_state.get_status(job_id)):
                         time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
                     continue
+
+                if task_id == num_tasks - 1:
+                    break
+
+                # The log for the current job is finished. We need to
+                # wait until next job to be started.
+                logger.debug(
+                    f'INFO: Log for the current task ({task_id}) '
+                    'is finished. Waiting for the next task\'s log '
+                    'to be started.')
+                # Add a newline to avoid the status display below
+                # removing the last line of the task output.
+                print()
+                status_display.update(
+                    ux_utils.spinner_message(
+                        f'Waiting for the next task: {task_id + 1}'))
+                status_display.start()
+                original_task_id = task_id
+                while True:
+                    task_id, managed_job_status = (
+                        managed_job_state.get_latest_task_id_status(job_id))
+                    if original_task_id != task_id:
+                        break
+                    time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
+                continue
+
             # The job can be cancelled by the user or the controller (when
             # the cluster is partially preempted).
             logger.debug(
@@ -527,15 +677,75 @@ def stream_logs(job_id: Optional[int],
                 'instead.')
         job_id = managed_job_ids.pop()
         assert job_id is not None, (job_id, job_name)
-        [... 9 removed lines not rendered in this diff view ...]
+
+        controller_log_path = os.path.join(
+            os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
+            f'{job_id}.log')
+        job_status = None
+
+        # Wait for the log file to be written
+        while not os.path.exists(controller_log_path):
+            if not follow:
+                # Assume that the log file hasn't been written yet. Since we
+                # aren't following, just return.
+                return ''
+
+            job_status = managed_job_state.get_status(job_id)
+            if job_status is None:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(f'Job {job_id} not found.')
+            # We shouldn't count CANCELLING as terminal here, the controller is
+            # still cleaning up.
+            if (job_status.is_terminal() and job_status !=
+                    managed_job_state.ManagedJobStatus.CANCELLING):
+                # Don't keep waiting. If the log file is not created by this
+                # point, it never will be. This job may have been submitted
+                # using an old version that did not create the log file, so this
+                # is not considered an exceptional case.
+                return ''
+
+            time.sleep(log_lib.SKY_LOG_WAITING_GAP_SECONDS)
+
+        # This code is based on log_lib.tail_logs. We can't use that code
+        # exactly because state works differently between managed jobs and
+        # normal jobs.
+        with open(controller_log_path, 'r', newline='', encoding='utf-8') as f:
+            # Note: we do not need to care about start_stream_at here, since
+            # that should be in the job log printed above.
+            for line in f:
+                print(line, end='')
+            # Flush.
+            print(end='', flush=True)
+
+            if follow:
+                while True:
+                    # Print all new lines, if there are any.
+                    line = f.readline()
+                    while line is not None and line != '':
+                        print(line, end='')
+                        line = f.readline()
+
+                    # Flush.
+                    print(end='', flush=True)
+
+                    # Check if the job if finished.
+                    job_status = managed_job_state.get_status(job_id)
+                    assert job_status is not None, (job_id, job_name)
+                    if job_status.is_terminal():
+                        break
+
+                    time.sleep(log_lib.SKY_LOG_TAILING_GAP_SECONDS)
+
+                # Wait for final logs to be written.
+                time.sleep(1 + log_lib.SKY_LOG_TAILING_GAP_SECONDS)
+
+            # Print any remaining logs including incomplete line.
+            print(f.read(), end='', flush=True)
+
+        if follow:
+            return ux_utils.finishing_message(
+                f'Job finished (status: {job_status}).')
+
         return ''
 
     if job_id is None:
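The controller-log streaming added above is essentially `tail -f` driven by job state: print whatever is already in the file, keep polling for new lines until the job reaches a terminal status, and flush once more at the end so a final partial line is not lost. A stripped-down sketch of that follow loop (the `job_is_terminal` callback and the poll interval are stand-ins for the managed-job state checks and `log_lib` constants used in the real code):

    import time
    from typing import Callable


    def follow_log(path: str, job_is_terminal: Callable[[], bool],
                   poll_interval: float = 0.5) -> None:
        # Stream a log file to stdout until the producer is known to be done.
        with open(path, 'r', newline='', encoding='utf-8') as f:
            while True:
                line = f.readline()
                if line:
                    print(line, end='', flush=True)
                    continue
                if job_is_terminal():
                    # Give the writer a moment to flush any final lines.
                    time.sleep(poll_interval)
                    print(f.read(), end='', flush=True)
                    return
                time.sleep(poll_interval)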
@@ -571,6 +781,7 @@ def dump_managed_job_queue() -> str:
             job_duration = 0
         job['job_duration'] = job_duration
         job['status'] = job['status'].value
+        job['schedule_state'] = job['schedule_state'].value
 
         cluster_name = generate_managed_job_cluster_name(
             job['task_name'], job['job_id'])
@@ -672,11 +883,18 @@ def format_job_table(
             status_counts[managed_job_status.value] += 1
 
     columns = [
-        'ID',
-        '
+        'ID',
+        'TASK',
+        'NAME',
+        'RESOURCES',
+        'SUBMITTED',
+        'TOT. DURATION',
+        'JOB DURATION',
+        '#RECOVERIES',
+        'STATUS',
     ]
     if show_all:
-        columns += ['STARTED', 'CLUSTER', 'REGION', '
+        columns += ['STARTED', 'CLUSTER', 'REGION', 'DESCRIPTION']
     if tasks_have_user:
         columns.insert(0, 'USER')
     job_table = log_utils.create_table(columns)
@@ -695,7 +913,25 @@ def format_job_table(
         # by the task_id.
         jobs[get_hash(task)].append(task)
 
+    def generate_description(failure_reason: Optional[str],
+                             schedule_state: Optional[str]) -> str:
+        description = ''
+        if schedule_state is not None:
+            description += f'Scheduler: {schedule_state}'
+            if failure_reason is not None:
+                description += ', '
+        if failure_reason is not None:
+            description += f'Failure: {failure_reason}'
+
+        if description == '':
+            return '-'
+
+        return description
+
     for job_hash, job_tasks in jobs.items():
+        if show_all:
+            schedule_state = job_tasks[0]['schedule_state']
+
         if len(job_tasks) > 1:
             # Aggregate the tasks into a new row in the table.
             job_name = job_tasks[0]['job_name']
@@ -718,7 +954,6 @@ def format_job_table(
                     end_at = None
                 recovery_cnt += task['recovery_count']
 
-            failure_reason = job_tasks[current_task_id]['failure_reason']
             job_duration = log_utils.readable_time_duration(0,
                                                             job_duration,
                                                             absolute=True)
@@ -744,11 +979,13 @@ def format_job_table(
                 status_str,
             ]
             if show_all:
+                schedule_state = job_tasks[0]['schedule_state']
+                failure_reason = job_tasks[current_task_id]['failure_reason']
                 job_values.extend([
                     '-',
                     '-',
                     '-',
-                    failure_reason
+                    generate_description(failure_reason, schedule_state),
                 ])
             if tasks_have_user:
                 job_values.insert(0, job_tasks[0].get('user', '-'))
@@ -776,13 +1013,17 @@ def format_job_table(
                     task['status'].colored_str(),
                 ]
                 if show_all:
+                    # schedule_state is only set at the job level, so if we have
+                    # more than one task, only display on the aggregated row.
+                    schedule_state = (task['schedule_state']
+                                      if len(job_tasks) == 1 else None)
                     values.extend([
                         # STARTED
                         log_utils.readable_time_duration(task['start_at']),
                         task['cluster_resources'],
                         task['region'],
-                        task['failure_reason']
-                        [... removed line not rendered in this diff view ...]
+                        generate_description(task['failure_reason'],
+                                             schedule_state),
                     ])
                 if tasks_have_user:
                     values.insert(0, task.get('user', '-'))
@@ -856,7 +1097,7 @@ class ManagedJobCodeGen:
         return cls._build(code)
 
     @classmethod
-    def get_all_job_ids_by_name(cls, job_name: str) -> str:
+    def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
         code = textwrap.dedent(f"""\
             from sky.utils import common_utils
             job_id = managed_job_state.get_all_job_ids_by_name({job_name!r})
@@ -877,6 +1118,7 @@ class ManagedJobCodeGen:
         # should be removed in v0.8.0.
         code = textwrap.dedent("""\
             import os
+            import time
 
             from sky.skylet import job_lib, log_lib
             from sky.skylet import constants
@@ -901,7 +1143,7 @@ class ManagedJobCodeGen:
         dag_name = managed_job_dag.name
         # Add the managed job to queue table.
         code = textwrap.dedent(f"""\
-            managed_job_state.
+            managed_job_state.set_job_info({job_id}, {dag_name!r})
             """)
         for task_id, task in enumerate(managed_job_dag.tasks):
             resources_str = backend_utils.get_task_resources_str(