skypilot-nightly 1.0.0.dev20250124__py3-none-any.whl → 1.0.0.dev20250125__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/jobs/controller.py +4 -1
- sky/jobs/state.py +79 -14
- sky/jobs/utils.py +188 -139
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +1 -1
- {skypilot_nightly-1.0.0.dev20250124.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250124.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/RECORD +12 -12
- {skypilot_nightly-1.0.0.dev20250124.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250124.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250124.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250124.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '485b1cd4688d5ac984cc666f372b55009cb064b7'


 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.
+__version__ = '1.0.0.dev20250125'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))

sky/jobs/controller.py
CHANGED
sky/jobs/state.py
CHANGED
@@ -230,12 +230,12 @@ class ManagedJobStatus(enum.Enum):
     # RECOVERING: The cluster is preempted, and the controller process is
     # recovering the cluster (relaunching/failover).
     RECOVERING = 'RECOVERING'
-    # Terminal statuses
-    # SUCCEEDED: The job is finished successfully.
-    SUCCEEDED = 'SUCCEEDED'
     # CANCELLING: The job is requested to be cancelled by the user, and the
     # controller is cleaning up the cluster.
     CANCELLING = 'CANCELLING'
+    # Terminal statuses
+    # SUCCEEDED: The job is finished successfully.
+    SUCCEEDED = 'SUCCEEDED'
     # CANCELLED: The job is cancelled by the user. When the managed job is in
     # CANCELLED status, the cluster has been cleaned up.
     CANCELLED = 'CANCELLED'
@@ -281,7 +281,6 @@ class ManagedJobStatus(enum.Enum):
             cls.FAILED_PRECHECKS,
             cls.FAILED_NO_RESOURCE,
             cls.FAILED_CONTROLLER,
-            cls.CANCELLING,
             cls.CANCELLED,
         ]

@@ -512,8 +511,12 @@ def set_failed(
     failure_reason: str,
     callback_func: Optional[CallbackType] = None,
     end_time: Optional[float] = None,
+    override_terminal: bool = False,
 ):
-    """Set an entire job or task to failed
+    """Set an entire job or task to failed.
+
+    By default, don't override tasks that are already terminal (that is, for
+    which end_at is already set).

     Args:
         job_id: The job id.
@@ -522,12 +525,13 @@ def set_failed(
         failure_type: The failure type. One of ManagedJobStatus.FAILED_*.
         failure_reason: The failure reason.
         end_time: The end time. If None, the current time will be used.
+        override_terminal: If True, override the current status even if end_at
+            is already set.
     """
     assert failure_type.is_failed(), failure_type
     end_time = time.time() if end_time is None else end_time

-    fields_to_set = {
-        'end_at': end_time,
+    fields_to_set: Dict[str, Any] = {
         'status': failure_type.value,
         'failure_reason': failure_reason,
     }
@@ -542,14 +546,31 @@ def set_failed(
             # affect the job duration calculation.
             fields_to_set['last_recovered_at'] = end_time
         set_str = ', '.join(f'{k}=(?)' for k in fields_to_set)
-
+        task_query_str = '' if task_id is None else 'AND task_id=(?)'
+        task_value = [] if task_id is None else [
+            task_id,
+        ]

-
-
-
-
-
-
+        if override_terminal:
+            # Use COALESCE for end_at to avoid overriding the existing end_at if
+            # it's already set.
+            cursor.execute(
+                f"""\
+                UPDATE spot SET
+                end_at = COALESCE(end_at, ?),
+                {set_str}
+                WHERE spot_job_id=(?) {task_query_str}""",
+                (end_time, *list(fields_to_set.values()), job_id, *task_value))
+        else:
+            # Only set if end_at is null, i.e. the previous status is not
+            # terminal.
+            cursor.execute(
+                f"""\
+                UPDATE spot SET
+                end_at = (?),
+                {set_str}
+                WHERE spot_job_id=(?) {task_query_str} AND end_at IS null""",
+                (end_time, *list(fields_to_set.values()), job_id, *task_value))
     if callback_func:
         callback_func('FAILED')
     logger.info(failure_reason)
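The override_terminal=True branch above leans on SQLite's COALESCE so that a job which already has end_at set keeps its original end time while the status is still overwritten. A minimal standalone sketch of that update pattern, using a throwaway in-memory table rather than SkyPilot's real spot schema:

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE spot (spot_job_id INTEGER, status TEXT, end_at REAL)')
# One job already terminal (end_at set), one still running (end_at NULL).
conn.execute(
    "INSERT INTO spot VALUES (1, 'SUCCEEDED', 100.0), (2, 'RUNNING', NULL)")

for job_id in (1, 2):
    # Same shape as the override_terminal=True branch: keep any existing
    # end_at, but unconditionally overwrite the status.
    conn.execute(
        'UPDATE spot SET end_at = COALESCE(end_at, ?), status = ? '
        'WHERE spot_job_id = ?', (200.0, 'FAILED_CONTROLLER', job_id))

print(conn.execute('SELECT * FROM spot ORDER BY spot_job_id').fetchall())
# [(1, 'FAILED_CONTROLLER', 100.0), (2, 'FAILED_CONTROLLER', 200.0)]
# Job 1 keeps its original end_at; job 2 gets the new one.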
@@ -677,6 +698,50 @@ def get_schedule_live_jobs(job_id: Optional[int]) -> List[Dict[str, Any]]:
     return jobs


+def get_jobs_to_check_status(job_id: Optional[int] = None) -> List[int]:
+    """Get jobs that need controller process checking.
+
+    Args:
+        job_id: Optional job ID to check. If None, checks all jobs.
+
+    Returns a list of job_ids, including the following:
+    - For jobs with schedule state: jobs that have schedule state not DONE
+    - For legacy jobs (no schedule state): jobs that are in non-terminal status
+    """
+    job_filter = '' if job_id is None else 'AND spot.spot_job_id=(?)'
+    job_value = () if job_id is None else (job_id,)
+
+    status_filter_str = ', '.join(['?'] *
+                                  len(ManagedJobStatus.terminal_statuses()))
+    terminal_status_values = [
+        status.value for status in ManagedJobStatus.terminal_statuses()
+    ]
+
+    # Get jobs that are either:
+    # 1. Have schedule state that is not DONE, or
+    # 2. Have no schedule state (legacy) AND are in non-terminal status
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows = cursor.execute(
+            f"""\
+            SELECT DISTINCT spot.spot_job_id
+            FROM spot
+            LEFT OUTER JOIN job_info
+            ON spot.spot_job_id=job_info.spot_job_id
+            WHERE (
+                (job_info.schedule_state IS NOT NULL AND
+                 job_info.schedule_state IS NOT ?)
+                OR
+                (job_info.schedule_state IS NULL AND
+                 status NOT IN ({status_filter_str}))
+            )
+            {job_filter}
+            ORDER BY spot.spot_job_id DESC""", [
+                ManagedJobScheduleState.DONE.value, *terminal_status_values,
+                *job_value
+            ]).fetchall()
+        return [row[0] for row in rows if row[0] is not None]
+
+
 def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
     """Get all job ids by name."""
     name_filter = ''
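To see what the new get_jobs_to_check_status query selects, here is a small self-contained sketch against a toy schema (not SkyPilot's actual spot/job_info tables), assuming for brevity that 'SUCCEEDED' is the only terminal status and 'DONE' the only finished schedule state:

import sqlite3

conn = sqlite3.connect(':memory:')
conn.executescript("""
CREATE TABLE spot (spot_job_id INTEGER, status TEXT);
CREATE TABLE job_info (spot_job_id INTEGER, schedule_state TEXT);
-- Job 1: has schedule state, not DONE yet -> needs checking.
INSERT INTO spot VALUES (1, 'SUCCEEDED');
INSERT INTO job_info VALUES (1, 'ALIVE');
-- Job 2: has schedule state, fully DONE -> skipped.
INSERT INTO spot VALUES (2, 'SUCCEEDED');
INSERT INTO job_info VALUES (2, 'DONE');
-- Job 3: legacy job (no job_info row), non-terminal status -> needs checking.
INSERT INTO spot VALUES (3, 'RUNNING');
""")

rows = conn.execute("""
SELECT DISTINCT spot.spot_job_id
FROM spot
LEFT OUTER JOIN job_info ON spot.spot_job_id = job_info.spot_job_id
WHERE (
    (job_info.schedule_state IS NOT NULL AND job_info.schedule_state IS NOT ?)
    OR
    (job_info.schedule_state IS NULL AND status NOT IN (?))
)
ORDER BY spot.spot_job_id DESC""", ('DONE', 'SUCCEEDED')).fetchall()
print([r[0] for r in rows])  # [3, 1]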
sky/jobs/utils.py
CHANGED
@@ -159,7 +159,7 @@ def _controller_process_alive(pid: int, job_id: int) -> bool:
         return False


-def update_managed_job_status(job_id: Optional[int] = None):
+def update_managed_jobs_statuses(job_id: Optional[int] = None):
     """Update managed job status if the controller process failed abnormally.

     Check the status of the controller process. If it is not running, it must
@@ -168,125 +168,175 @@ def update_managed_job_status(job_id: Optional[int] = None):
     when above happens, which could be not accurate based on the frequency this
     function is called.

-    Note: we expect that job_id, if provided, refers to a nonterminal job
+    Note: we expect that job_id, if provided, refers to a nonterminal job or a
+    job that has not completed its cleanup (schedule state not DONE).
     """

-
-
-        # a terminal status during the course of this function. The set_failed()
-        # called below will not update the state for jobs that already have a
-        # terminal status, so it should be fine.
-        job_ids = managed_job_state.get_nonterminal_job_ids_by_name(None)
-    else:
-        job_ids = [job_id]
-    for job_id_ in job_ids:
-
-        failure_reason = None
-
-        tasks = managed_job_state.get_managed_jobs(job_id_)
-        schedule_state = tasks[0]['schedule_state']
-        if schedule_state is None:
-            # Backwards compatibility: this job was submitted when ray was still
-            # used for managing the parallelism of job controllers.
-            # TODO(cooperc): Remove before 0.11.0.
-            controller_status = job_lib.get_status(job_id_)
-            if controller_status is None or controller_status.is_terminal():
-                logger.error(f'Controller process for legacy job {job_id_} is '
-                             'in an unexpected state.')
-                failure_reason = 'Legacy job is in an unexpected state'
-
-                # Continue to mark the job as failed.
-            else:
-                # Still running.
-                continue
-        else:
-            pid = tasks[0]['controller_pid']
-            if pid is None:
-                if schedule_state in (
-                        managed_job_state.ManagedJobScheduleState.INACTIVE,
-                        managed_job_state.ManagedJobScheduleState.WAITING):
-                    # Job has not been scheduled yet.
-                    continue
-                elif (schedule_state ==
-                      managed_job_state.ManagedJobScheduleState.LAUNCHING):
-                    # This should only be the case for a very short period of
-                    # time between marking the job as submitted and writing the
-                    # launched controller process pid back to the database (see
-                    # scheduler.maybe_schedule_next_jobs).
-                    # TODO(cooperc): Find a way to detect if we get stuck in
-                    # this state.
-                    logger.info(f'Job {job_id_} is in LAUNCHING state, '
-                                'but controller process hasn\'t started yet.')
-                    continue
-                # All other statuses are unexpected. Proceed to mark as failed.
-                logger.error(f'Expected to find a controller pid for state '
-                             f'{schedule_state.value} but found none.')
-                failure_reason = ('No controller pid set for '
-                                  f'{schedule_state.value}')
-            else:
-                logger.debug(f'Checking controller pid {pid}')
-                if _controller_process_alive(pid, job_id_):
-                    # The controller is still running.
-                    continue
-                # Otherwise, proceed to mark the job as failed.
-                logger.error(f'Controller process for {job_id_} seems to be '
-                             'dead.')
-                failure_reason = 'Controller process is dead'
+    def _cleanup_job_clusters(job_id: int) -> Optional[str]:
+        """Clean up clusters for a job. Returns error message if any.

-
-
+        This function should not throw any exception. If it fails, it will
+        capture the error message, and log/return it.
+        """
+        error_msg = None
+        tasks = managed_job_state.get_managed_jobs(job_id)
         for task in tasks:
             task_name = task['job_name']
-
-            cluster_name = generate_managed_job_cluster_name(task_name, job_id_)
+            cluster_name = generate_managed_job_cluster_name(task_name, job_id)
             handle = global_user_state.get_handle_from_cluster_name(
                 cluster_name)
-            # If the cluster exists, terminate it.
             if handle is not None:
-
+                try:
+                    terminate_cluster(cluster_name)
+                except Exception as e:  # pylint: disable=broad-except
+                    error_msg = (
+                        f'Failed to terminate cluster {cluster_name}: '
+                        f'{common_utils.format_exception(e, use_bracket=True)}')
+                    logger.exception(error_msg, exc_info=e)
+        return error_msg
+
+    # For backwards compatible jobs
+    # TODO(cooperc): Remove before 0.11.0.
+    def _handle_legacy_job(job_id: int):
+        controller_status = job_lib.get_status(job_id)
+        if controller_status is None or controller_status.is_terminal():
+            logger.error(f'Controller process for legacy job {job_id} is '
+                         'in an unexpected state.')
+
+            cleanup_error = _cleanup_job_clusters(job_id)
+            if cleanup_error:
+                # Unconditionally set the job to failed_controller if the
+                # cleanup fails.
+                managed_job_state.set_failed(
+                    job_id,
+                    task_id=None,
+                    failure_type=managed_job_state.ManagedJobStatus.
+                    FAILED_CONTROLLER,
+                    failure_reason=
+                    'Legacy controller process has exited abnormally, and '
+                    f'cleanup failed: {cleanup_error}. For more details, run: '
+                    f'sky jobs logs --controller {job_id}',
+                    override_terminal=True)
+                return
+
+            # It's possible for the job to have transitioned to
+            # another terminal state while between when we checked its
+            # state and now. In that case, set_failed won't do
+            # anything, which is fine.
+            managed_job_state.set_failed(
+                job_id,
+                task_id=None,
+                failure_type=managed_job_state.ManagedJobStatus.
+                FAILED_CONTROLLER,
+                failure_reason=(
+                    'Legacy controller process has exited abnormally. For '
+                    f'more details, run: sky jobs logs --controller {job_id}'))
+
+    # Get jobs that need checking (non-terminal or not DONE)
+    job_ids = managed_job_state.get_jobs_to_check_status(job_id)
+    if not job_ids:
+        # job_id is already terminal, or if job_id is None, there are no jobs
+        # that need to be checked.
+        return
+
+    for job_id in job_ids:
+        tasks = managed_job_state.get_managed_jobs(job_id)
+        # Note: controller_pid and schedule_state are in the job_info table
+        # which is joined to the spot table, so all tasks with the same job_id
+        # will have the same value for these columns. This is what lets us just
+        # take tasks[0]['controller_pid'] and tasks[0]['schedule_state'].
+        schedule_state = tasks[0]['schedule_state']
+
+        # Backwards compatibility: this job was submitted when ray was still
+        # used for managing the parallelism of job controllers.
+        # TODO(cooperc): Remove before 0.11.0.
+        if (schedule_state is
+                managed_job_state.ManagedJobScheduleState.INVALID):
+            _handle_legacy_job(job_id)
+            continue
+
+        # For jobs with schedule state:
+        pid = tasks[0]['controller_pid']
+        if pid is None:
+            if schedule_state in (
+                    managed_job_state.ManagedJobScheduleState.INACTIVE,
+                    managed_job_state.ManagedJobScheduleState.WAITING):
+                # For these states, the controller hasn't been started yet.
+                # This is expected.
+                continue
+
+            if (schedule_state ==
+                    managed_job_state.ManagedJobScheduleState.LAUNCHING):
+                # This is unlikely but technically possible. There's a brief
+                # period between marking job as scheduled (LAUNCHING) and
+                # actually launching the controller process and writing the pid
+                # back to the table.
+                # TODO(cooperc): Find a way to detect if we get stuck in this
+                # state.
+                logger.info(f'Job {job_id} is in {schedule_state.value} state, '
+                            'but controller process hasn\'t started yet.')
+                continue
+
+            logger.error(f'Expected to find a controller pid for state '
+                         f'{schedule_state.value} but found none.')
+            failure_reason = f'No controller pid set for {schedule_state.value}'
+        else:
+            logger.debug(f'Checking controller pid {pid}')
+            if _controller_process_alive(pid, job_id):
+                # The controller is still running, so this job is fine.
+                continue
+
+            # Double check job is not already DONE before marking as failed, to
+            # avoid the race where the controller marked itself as DONE and
+            # exited between the state check and the pid check. Since the job
+            # controller process will mark itself DONE _before_ exiting, if it
+            # has exited and it's still not DONE now, it is abnormal.
+            if (managed_job_state.get_job_schedule_state(job_id) ==
+                    managed_job_state.ManagedJobScheduleState.DONE):
+                # Never mind, the job is DONE now. This is fine.
+                continue
+
+            logger.error(f'Controller process for {job_id} seems to be dead.')
+            failure_reason = 'Controller process is dead'
+
+        # At this point, either pid is None or process is dead.

         # The controller process for this managed job is not running: it must
         # have exited abnormally, and we should set the job status to
         # FAILED_CONTROLLER.
-
-
+        logger.error(f'Controller process for job {job_id} has exited '
+                     'abnormally. Setting the job status to FAILED_CONTROLLER.')
+
+        # Cleanup clusters and capture any errors.
+        cleanup_error = _cleanup_job_clusters(job_id)
+        cleanup_error_msg = ''
+        if cleanup_error:
+            cleanup_error_msg = f'Also, cleanup failed: {cleanup_error}. '
+
+        # Set all tasks to FAILED_CONTROLLER, regardless of current status.
+        # This may change a job from SUCCEEDED or another terminal state to
+        # FAILED_CONTROLLER. This is what we want - we are sure that this
+        # controller process crashed, so we want to capture that even if the
+        # underlying job succeeded.
+        # Note: 2+ invocations of update_managed_jobs_statuses could be running
+        # at the same time, so this could override the FAILED_CONTROLLER status
+        # set by another invocation of update_managed_jobs_statuses. That should
+        # be okay. The only difference could be that one process failed to clean
+        # up the cluster while the other succeeds. No matter which
+        # failure_reason ends up in the database, the outcome is acceptable.
+        # We assume that no other code path outside the controller process will
+        # update the job status.
         managed_job_state.set_failed(
-
+            job_id,
             task_id=None,
             failure_type=managed_job_state.ManagedJobStatus.FAILED_CONTROLLER,
             failure_reason=
-            f'Controller process has exited abnormally ({failure_reason}).
-            f'
-
-
-    # Some jobs may be in a terminal status, but are not yet DONE. For instance,
-    # they may be still cleaning up resources, etc. Such jobs won't be captured
-    # by the above check, which only looks at nonterminal jobs. So, check the
-    # controller liveness of all jobs that should have live controller
-    # processes.
-    for job_info in managed_job_state.get_schedule_live_jobs(job_id):
-        if not job_info['controller_pid']:
-            # Technically, a job with no controller process but in LAUNCHING
-            # schedule state can happen very briefly after the job is set to
-            # LAUNCHING but before the controller process is actually spawned.
-            # However, if we observe any state other than LAUNCHING, something
-            # is clearly wrong.
-            if (job_info['schedule_state'] !=
-                    managed_job_state.ManagedJobScheduleState.LAUNCHING):
-                logger.error(
-                    f'Missing controller PID for {job_info["job_id"]}. '
-                    'Setting to DONE.')
-                scheduler.job_done(job_info['job_id'])
-            else:
-                logger.info(f'LAUNCHING job {job_info["job_id"]} has no '
-                            'controller process yet. Skipping.')
+            f'Controller process has exited abnormally ({failure_reason}). '
+            f'{cleanup_error_msg}'
+            f'For more details, run: sky jobs logs --controller {job_id}',
+            override_terminal=True)

-
-                job_info['job_id']):
-            logger.error(
-                f'Controller process for job {job_info["job_id"]} is not '
-                'alive. Marking the job as DONE.')
-            scheduler.job_done(job_info['job_id'])
+        scheduler.job_done(job_id, idempotent=True)


 def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
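The rewritten loop above ultimately reduces each job to a single question: is the recorded controller PID still a live process? _controller_process_alive itself is defined outside this hunk, so its body is not shown here; as a rough, POSIX-only illustration of that kind of liveness probe (not SkyPilot's actual implementation, which also takes the job id so it can tie the process back to the job), one could write:

import os


def pid_alive(pid: int) -> bool:
    """Best-effort check that a process with this PID currently exists."""
    if pid <= 0:
        return False
    try:
        # Signal 0 performs existence/permission checks without actually
        # delivering a signal to the target process.
        os.kill(pid, 0)
    except ProcessLookupError:
        return False  # No such process.
    except PermissionError:
        return True   # Process exists but belongs to another user.
    return True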
@@ -382,7 +432,7 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]]) -> str:
                            f'{job_status.value}. Skipped.')
            continue

-        update_managed_job_status(job_id)
+        update_managed_jobs_statuses(job_id)

        # Send the signal to the jobs controller.
        signal_file = pathlib.Path(SIGNAL_FILE_PREFIX.format(job_id))
@@ -424,36 +474,24 @@ def cancel_job_by_name(job_name: str) -> str:

 def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
     """Stream logs by job id."""
-
-
-
-
+
+    def should_keep_logging(status: managed_job_state.ManagedJobStatus) -> bool:
+        # If we see CANCELLING, just exit - we could miss some job logs but the
+        # job will be terminated momentarily anyway so we don't really care.
+        return (not status.is_terminal() and
+                status != managed_job_state.ManagedJobStatus.CANCELLING)
+
+    msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='', job_id=job_id)
+    status_display = rich_utils.safe_status(msg)
     num_tasks = managed_job_state.get_num_tasks(job_id)

     with status_display:
-        prev_msg = None
-        while (controller_status != job_lib.JobStatus.RUNNING and
-               (controller_status is None or
-                not controller_status.is_terminal())):
-            status_str = 'None'
-            if controller_status is not None:
-                status_str = controller_status.value
-            msg = status_msg.format(status_str=f' (status: {status_str})')
-            if msg != prev_msg:
-                status_display.update(msg)
-                prev_msg = msg
-            time.sleep(_LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS)
-            controller_status = job_lib.get_status(job_id)
-
-        msg = _JOB_WAITING_STATUS_MESSAGE.format(status_str='', job_id=job_id)
-        status_display.update(msg)
         prev_msg = msg
-        managed_job_status
-
+        while (managed_job_status :=
+               managed_job_state.get_status(job_id)) is None:
             time.sleep(1)
-            managed_job_status = managed_job_state.get_status(job_id)

-        if managed_job_status
+        if not should_keep_logging(managed_job_status):
             job_msg = ''
             if managed_job_status.is_failed():
                 job_msg = ('\nFailure reason: '
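The waiting logic above now uses an assignment expression, so each pass of the loop fetches the status once and exits as soon as it is non-None. A tiny standalone illustration of the pattern, with a stand-in get_status instead of the real state module:

import itertools
import time

# Pretend the job row appears after two polls.
_statuses = itertools.chain([None, None], itertools.repeat('RUNNING'))


def get_status(job_id: int):
    return next(_statuses)


while (status := get_status(job_id=1)) is None:
    # Keep polling until the job shows up in the state store.
    time.sleep(0.1)
print(status)  # RUNNING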
@@ -480,10 +518,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
         task_id, managed_job_status = (
             managed_job_state.get_latest_task_id_status(job_id))

-        #
-        #
-
-
+        # We wait for managed_job_status to be not None above. Once we see that
+        # it's not None, we don't expect it to every become None again.
+        assert managed_job_status is not None, (job_id, task_id,
+                                                managed_job_status)
+
+        while should_keep_logging(managed_job_status):
             handle = None
             if task_id is not None:
                 task_name = managed_job_state.get_task_name(job_id, task_id)
@@ -513,8 +553,11 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                 time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
                 task_id, managed_job_status = (
                     managed_job_state.get_latest_task_id_status(job_id))
+                assert managed_job_status is not None, (job_id, task_id,
+                                                        managed_job_status)
                 continue
-            assert managed_job_status
+            assert (managed_job_status ==
+                    managed_job_state.ManagedJobStatus.RUNNING)
             assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
             status_display.stop()
             returncode = backend.tail_logs(handle,
@@ -568,6 +611,8 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                        managed_job_status :=
                        managed_job_state.get_status(job_id)):
                     time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
+                assert managed_job_status is not None, (
+                    job_id, managed_job_status)
                 continue

             if task_id == num_tasks - 1:
@@ -593,6 +638,8 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
                     if original_task_id != task_id:
                         break
                     time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
+                assert managed_job_status is not None, (job_id, task_id,
+                                                        managed_job_status)
                 continue

             # The job can be cancelled by the user or the controller (when
@@ -608,7 +655,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
             # state.
             managed_job_status = managed_job_state.get_status(job_id)
             assert managed_job_status is not None, job_id
-            if managed_job_status
+            if not should_keep_logging(managed_job_status):
                 break
             logger.info(f'{colorama.Fore.YELLOW}The job cluster is preempted '
                         f'or failed.{colorama.Style.RESET_ALL}')
@@ -623,6 +670,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
             # managed job state is updated.
             time.sleep(3 * JOB_STATUS_CHECK_GAP_SECONDS)
             managed_job_status = managed_job_state.get_status(job_id)
+            assert managed_job_status is not None, (job_id, managed_job_status)

     # The managed_job_status may not be in terminal status yet, since the
     # controller has not updated the managed job state yet. We wait for a while,
@@ -630,7 +678,7 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
     wait_seconds = 0
     managed_job_status = managed_job_state.get_status(job_id)
     assert managed_job_status is not None, job_id
-    while (
+    while (should_keep_logging(managed_job_status) and follow and
           wait_seconds < _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS):
        time.sleep(1)
        wait_seconds += 1
@@ -694,10 +742,7 @@ def stream_logs(job_id: Optional[int],
             if job_status is None:
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(f'Job {job_id} not found.')
-
-            # still cleaning up.
-            if (job_status.is_terminal() and job_status !=
-                    managed_job_state.ManagedJobStatus.CANCELLING):
+            if job_status.is_terminal():
                 # Don't keep waiting. If the log file is not created by this
                 # point, it never will be. This job may have been submitted
                 # using an old version that did not create the log file, so this
@@ -729,6 +774,10 @@ def stream_logs(job_id: Optional[int],
                     print(end='', flush=True)

                     # Check if the job if finished.
+                    # TODO(cooperc): The controller can still be
+                    # cleaning up if job is in a terminal status
+                    # (e.g. SUCCEEDED). We want to follow those logs
+                    # too. Use DONE instead?
                     job_status = managed_job_state.get_status(job_id)
                     assert job_status is not None, (job_id, job_name)
                     if job_status.is_terminal():
sky/skylet/constants.py
CHANGED
@@ -86,7 +86,7 @@ TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS'
 # cluster yaml is updated.
 #
 # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
-SKYLET_VERSION = '
+SKYLET_VERSION = '11'
 # The version of the lib files that skylet/jobs use. Whenever there is an API
 # change for the job_lib or log_lib, we need to bump this version, so that the
 # user can be notified to update their SkyPilot version on the remote cluster.
sky/skylet/events.py
CHANGED
{skypilot_nightly-1.0.0.dev20250124.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/RECORD
RENAMED
@@ -1,4 +1,4 @@
-sky/__init__.py,sha256=
+sky/__init__.py,sha256=byguYOHI2wvQyWrh97v5OmKwEiIEv4lxNHqBSxTPCXc,5944
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=LXUDABKP1FJCS256xTTDJa40WXwHKF5x49S-4hZbD1M,21501
 sky/check.py,sha256=qTpm3N1zUZi2inEZPsrbt278B3h8nsk2gnepzIgLybE,10899
@@ -98,12 +98,12 @@ sky/data/storage.py,sha256=CWVKnHhdzXw1biPbRqYizkyVexL_OCELuJCqtd4hit4,204094
 sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
 sky/jobs/__init__.py,sha256=ObZcz3lL1ip8JcmR6gbfZ4RMMfXJJdsnuU2zLQUb8jY,1546
 sky/jobs/constants.py,sha256=6RphkJ6pmafQ7XYW5qwId1Zvqb99HJelA9kgrgfNR7o,1421
-sky/jobs/controller.py,sha256=
+sky/jobs/controller.py,sha256=0WcOk8xRZ-mZWuza-WE-ICKZTgZvXxNzj9pWXUslm6E,28312
 sky/jobs/core.py,sha256=2_Q9thiBPnd3i2nDqyUtQY-dsGZ1kRgAdnLcXHoycYo,19938
 sky/jobs/recovery_strategy.py,sha256=m-EA-MWXPFrgx2CYFPr6MmgeUoDTEBmY2xruD2PRSGY,26365
 sky/jobs/scheduler.py,sha256=WAvNb8-vBk8q1zFordFdpH7gxqWDjPHDGZZay6aodOk,12028
-sky/jobs/state.py,sha256=
-sky/jobs/utils.py,sha256=
+sky/jobs/state.py,sha256=bvBNZMg3DzPfS4eHNzMqYaMui2cqnWoWGDIaiOpaXSk,40770
+sky/jobs/utils.py,sha256=RGVytFmB6SmKK3qZp_8UID_T5ssxSJOgwCDgIvRmhtM,51785
 sky/jobs/dashboard/dashboard.py,sha256=KMSarpVcfnc-ELPFvy1M9_I1k4kSeXubTk3ibQC67Tg,3219
 sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
 sky/jobs/dashboard/templates/index.html,sha256=su1tqgcsXNl1lGl9hfIR6ig1f531OO57x1Tc2mNDK7U,11139
@@ -207,8 +207,8 @@ sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,2136
 sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,4478
 sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
-sky/skylet/constants.py,sha256=
-sky/skylet/events.py,sha256=
+sky/skylet/constants.py,sha256=uLEVhMZXpIlj7b_03ixAI6rC6fTM1k5xPUWR4LvzQyo,16022
+sky/skylet/events.py,sha256=0bOjUYpphuAficD9wDB5NOan2vwJDaRqdnm4sl0RK0U,12535
 sky/skylet/job_lib.py,sha256=Rk-C069cusJIRXsks8xqCb016JSt7GlpU7LrpX0qFJk,42785
 sky/skylet/log_lib.py,sha256=oFEBd85vDYFrIyyZKekH30yc4rRYILC0F0o-COQ64oE,20445
 sky/skylet/log_lib.pyi,sha256=rRk4eUX0RHGs1QL9CXsJq6RE7FqqxZlfuPJOLXTvg7I,4453
@@ -289,9 +289,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
 sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
 sky/utils/kubernetes/rsync_helper.sh,sha256=h4YwrPFf9727CACnMJvF3EyK_0OeOYKKt4su_daKekw,1256
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=Kq1MDygF2IxFmu9FXpCxqucXLmeUrvs6OtRij6XTQbo,6554
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
-skypilot_nightly-1.0.0.
+skypilot_nightly-1.0.0.dev20250125.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20250125.dist-info/METADATA,sha256=8ozTZDBrQLiIaTS3-_CStvAfJE7XPmuwGGWneS_gj7o,21038
+skypilot_nightly-1.0.0.dev20250125.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+skypilot_nightly-1.0.0.dev20250125.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20250125.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20250125.dist-info/RECORD,,
{skypilot_nightly-1.0.0.dev20250124.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/LICENSE
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250124.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/WHEEL
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250124.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/entry_points.txt
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250124.dist-info → skypilot_nightly-1.0.0.dev20250125.dist-info}/top_level.txt
RENAMED
File without changes