skypilot-nightly 1.0.0.dev20250116__py3-none-any.whl → 1.0.0.dev20250117__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +50 -29
- sky/cli.py +11 -34
- sky/core.py +8 -5
- sky/data/storage.py +16 -7
- sky/global_user_state.py +1 -1
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +14 -16
- sky/jobs/core.py +0 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +251 -17
- sky/jobs/utils.py +287 -64
- sky/provision/kubernetes/instance.py +1 -1
- sky/resources.py +1 -1
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +2 -26
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/skylet.py +1 -1
- sky/templates/jobs-controller.yaml.j2 +7 -3
- sky/utils/resources_utils.py +25 -21
- sky/utils/subprocess_utils.py +48 -9
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/RECORD +30 -29
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250116.dist-info → skypilot_nightly-1.0.0.dev20250117.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
@@ -107,12 +107,25 @@ def create_table(cursor, conn):
|
|
107
107
|
db_utils.add_column_to_table(cursor, conn, 'spot', 'local_log_file',
|
108
108
|
'TEXT DEFAULT NULL')
|
109
109
|
|
110
|
-
# `job_info` contains the mapping from job_id to the job_name
|
111
|
-
#
|
110
|
+
# `job_info` contains the mapping from job_id to the job_name, as well as
|
111
|
+
# information used by the scheduler.
|
112
112
|
cursor.execute("""\
|
113
113
|
CREATE TABLE IF NOT EXISTS job_info (
|
114
114
|
spot_job_id INTEGER PRIMARY KEY AUTOINCREMENT,
|
115
|
-
name TEXT
|
115
|
+
name TEXT,
|
116
|
+
schedule_state TEXT,
|
117
|
+
controller_pid INTEGER DEFAULT NULL,
|
118
|
+
dag_yaml_path TEXT)""")
|
119
|
+
|
120
|
+
db_utils.add_column_to_table(cursor, conn, 'job_info', 'schedule_state',
|
121
|
+
'TEXT')
|
122
|
+
|
123
|
+
db_utils.add_column_to_table(cursor, conn, 'job_info', 'controller_pid',
|
124
|
+
'INTEGER DEFAULT NULL')
|
125
|
+
|
126
|
+
db_utils.add_column_to_table(cursor, conn, 'job_info', 'dag_yaml_path',
|
127
|
+
'TEXT')
|
128
|
+
|
116
129
|
conn.commit()
|
117
130
|
|
118
131
|
|
@@ -164,6 +177,9 @@ columns = [
|
|
164
177
|
# columns from the job_info table
|
165
178
|
'_job_info_job_id', # This should be the same as job_id
|
166
179
|
'job_name',
|
180
|
+
'schedule_state',
|
181
|
+
'controller_pid',
|
182
|
+
'dag_yaml_path',
|
167
183
|
]
|
168
184
|
|
169
185
|
|
@@ -189,16 +205,18 @@ class ManagedJobStatus(enum.Enum):
|
|
189
205
|
SUCCEEDED -> SUCCEEDED
|
190
206
|
FAILED -> FAILED
|
191
207
|
FAILED_SETUP -> FAILED_SETUP
|
208
|
+
Not all statuses are in this list, since some ManagedJobStatuses are only
|
209
|
+
possible while the cluster is INIT/STOPPED/not yet UP.
|
192
210
|
Note that the JobStatus will not be stuck in PENDING, because each cluster
|
193
211
|
is dedicated to a managed job, i.e. there should always be enough resource
|
194
212
|
to run the job and the job will be immediately transitioned to RUNNING.
|
213
|
+
|
195
214
|
"""
|
196
215
|
# PENDING: Waiting for the jobs controller to have a slot to run the
|
197
216
|
# controller process.
|
198
|
-
# The submitted_at timestamp of the managed job in the 'spot' table will be
|
199
|
-
# set to the time when the job is firstly submitted by the user (set to
|
200
|
-
# PENDING).
|
201
217
|
PENDING = 'PENDING'
|
218
|
+
# The submitted_at timestamp of the managed job in the 'spot' table will be
|
219
|
+
# set to the time when the job controller begins running.
|
202
220
|
# SUBMITTED: The jobs controller starts the controller process.
|
203
221
|
SUBMITTED = 'SUBMITTED'
|
204
222
|
# STARTING: The controller process is launching the cluster for the managed
|
@@ -292,14 +310,66 @@ _SPOT_STATUS_TO_COLOR = {
|
|
292
310
|
}
|
293
311
|
|
294
312
|
|
313
|
+
class ManagedJobScheduleState(enum.Enum):
|
314
|
+
"""Captures the state of the job from the scheduler's perspective.
|
315
|
+
|
316
|
+
A newly created job will be INACTIVE. The following transitions are valid:
|
317
|
+
- INACTIVE -> WAITING: The job is "submitted" to the scheduler, and its job
|
318
|
+
controller can be started.
|
319
|
+
- WAITING -> LAUNCHING: The job controller is started by the scheduler and
|
320
|
+
may proceed to sky.launch.
|
321
|
+
- LAUNCHING -> ALIVE: The launch attempt was completed. It may have
|
322
|
+
succeeded or failed. The job controller is not allowed to sky.launch again
|
323
|
+
without transitioning to ALIVE_WAITING and then LAUNCHING.
|
324
|
+
- ALIVE -> ALIVE_WAITING: The job controller wants to sky.launch again,
|
325
|
+
either for recovery or to launch a subsequent task.
|
326
|
+
- ALIVE_WAITING -> LAUNCHING: The scheduler has determined that the job
|
327
|
+
controller may launch again.
|
328
|
+
- LAUNCHING, ALIVE, or ALIVE_WAITING -> DONE: The job controller is exiting
|
329
|
+
and the job is in some terminal status. In the future it may be possible
|
330
|
+
to transition directly from WAITING or even INACTIVE to DONE if the job is
|
331
|
+
cancelled.
|
332
|
+
|
333
|
+
There is no well-defined mapping from the managed job status to schedule
|
334
|
+
state or vice versa. (In fact, schedule state is defined on the job and
|
335
|
+
status on the task.)
|
336
|
+
- INACTIVE or WAITING should only be seen when a job is PENDING.
|
337
|
+
- ALIVE_WAITING should only be seen when a job is RECOVERING, has multiple
|
338
|
+
tasks, or needs to retry launching.
|
339
|
+
- LAUNCHING and ALIVE can be seen in many different statuses.
|
340
|
+
- DONE should only be seen when a job is in a terminal status.
|
341
|
+
Since state and status transitions are not atomic, it may be possible to
|
342
|
+
briefly observe inconsistent states, like a job that just finished but
|
343
|
+
hasn't yet transitioned to DONE.
|
344
|
+
"""
|
345
|
+
# The job should be ignored by the scheduler.
|
346
|
+
INACTIVE = 'INACTIVE'
|
347
|
+
# The job is waiting to transition to LAUNCHING for the first time. The
|
348
|
+
# scheduler should try to transition it, and when it does, it should start
|
349
|
+
# the job controller.
|
350
|
+
WAITING = 'WAITING'
|
351
|
+
# The job is already alive, but wants to transition back to LAUNCHING,
|
352
|
+
# e.g. for recovery, or launching later tasks in the DAG. The scheduler
|
353
|
+
# should try to transition it to LAUNCHING.
|
354
|
+
ALIVE_WAITING = 'ALIVE_WAITING'
|
355
|
+
# The job is running sky.launch, or soon will, using a limited number of
|
356
|
+
# allowed launch slots.
|
357
|
+
LAUNCHING = 'LAUNCHING'
|
358
|
+
# The controller for the job is running, but it's not currently launching.
|
359
|
+
ALIVE = 'ALIVE'
|
360
|
+
# The job is in a terminal state. (Not necessarily SUCCEEDED.)
|
361
|
+
DONE = 'DONE'
|
362
|
+
|
363
|
+
|
295
364
|
# === Status transition functions ===
|
296
|
-
def
|
365
|
+
def set_job_info(job_id: int, name: str):
|
297
366
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
298
367
|
cursor.execute(
|
299
368
|
"""\
|
300
369
|
INSERT INTO job_info
|
301
|
-
(spot_job_id, name)
|
302
|
-
VALUES (?, ?)""",
|
370
|
+
(spot_job_id, name, schedule_state)
|
371
|
+
VALUES (?, ?, ?)""",
|
372
|
+
(job_id, name, ManagedJobScheduleState.INACTIVE.value))
|
303
373
|
|
304
374
|
|
305
375
|
def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
|
@@ -324,7 +394,7 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
|
|
324
394
|
job_id: The managed job ID.
|
325
395
|
task_id: The task ID.
|
326
396
|
run_timestamp: The run_timestamp of the run. This will be used to
|
327
|
-
|
397
|
+
determine the log directory of the managed task.
|
328
398
|
submit_time: The time when the managed task is submitted.
|
329
399
|
resources_str: The resources string of the managed task.
|
330
400
|
specs: The specs of the managed task.
|
@@ -458,13 +528,12 @@ def set_failed(
|
|
458
528
|
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
459
529
|
previous_status = cursor.execute(
|
460
530
|
'SELECT status FROM spot WHERE spot_job_id=(?)',
|
461
|
-
(job_id,)).fetchone()
|
462
|
-
previous_status = ManagedJobStatus(previous_status
|
463
|
-
if previous_status
|
464
|
-
# If the job is recovering, we should set the
|
465
|
-
#
|
466
|
-
#
|
467
|
-
# calculation.
|
531
|
+
(job_id,)).fetchone()[0]
|
532
|
+
previous_status = ManagedJobStatus(previous_status)
|
533
|
+
if previous_status == ManagedJobStatus.RECOVERING:
|
534
|
+
# If the job is recovering, we should set the last_recovered_at to
|
535
|
+
# the end_time, so that the end_at - last_recovered_at will not
|
536
|
+
# affect the job duration calculation.
|
468
537
|
fields_to_set['last_recovered_at'] = end_time
|
469
538
|
set_str = ', '.join(f'{k}=(?)' for k in fields_to_set)
|
470
539
|
task_str = '' if task_id is None else f' AND task_id={task_id}'
|
@@ -564,6 +633,44 @@ def get_nonterminal_job_ids_by_name(name: Optional[str]) -> List[int]:
|
|
564
633
|
return job_ids
|
565
634
|
|
566
635
|
|
636
|
+
def get_schedule_live_jobs(job_id: Optional[int]) -> List[Dict[str, Any]]:
|
637
|
+
"""Get jobs from the database that have a live schedule_state.
|
638
|
+
|
639
|
+
This should return job(s) that are not INACTIVE, WAITING, or DONE. So a
|
640
|
+
returned job should correspond to a live job controller process, with one
|
641
|
+
exception: the job may have just transitioned from WAITING to LAUNCHING, but
|
642
|
+
the controller process has not yet started.
|
643
|
+
"""
|
644
|
+
job_filter = '' if job_id is None else 'AND spot_job_id=(?)'
|
645
|
+
job_value = (job_id,) if job_id is not None else ()
|
646
|
+
|
647
|
+
# Only the job_info table is queried here; no join with the spot table
|
648
|
+
# is needed. Rows whose schedule_state is NULL (for example, jobs that
|
649
|
+
# were created before the schedule_state column was added) do not
|
650
|
+
# satisfy the NOT IN filter below, so they are excluded together with
|
651
|
+
# the INACTIVE, WAITING, and DONE jobs.
|
652
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
653
|
+
rows = cursor.execute(
|
654
|
+
f"""\
|
655
|
+
SELECT spot_job_id, schedule_state, controller_pid
|
656
|
+
FROM job_info
|
657
|
+
WHERE schedule_state not in (?, ?, ?)
|
658
|
+
{job_filter}
|
659
|
+
ORDER BY spot_job_id DESC""",
|
660
|
+
(ManagedJobScheduleState.INACTIVE.value,
|
661
|
+
ManagedJobScheduleState.WAITING.value,
|
662
|
+
ManagedJobScheduleState.DONE.value, *job_value)).fetchall()
|
663
|
+
jobs = []
|
664
|
+
for row in rows:
|
665
|
+
job_dict = {
|
666
|
+
'job_id': row[0],
|
667
|
+
'schedule_state': ManagedJobScheduleState(row[1]),
|
668
|
+
'controller_pid': row[2],
|
669
|
+
}
|
670
|
+
jobs.append(job_dict)
|
671
|
+
return jobs
|
672
|
+
|
673
|
+
|
567
674
|
def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
|
568
675
|
"""Get all job ids by name."""
|
569
676
|
name_filter = ''
|
@@ -672,6 +779,8 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
|
|
672
779
|
for row in rows:
|
673
780
|
job_dict = dict(zip(columns, row))
|
674
781
|
job_dict['status'] = ManagedJobStatus(job_dict['status'])
|
782
|
+
job_dict['schedule_state'] = ManagedJobScheduleState(
|
783
|
+
job_dict['schedule_state'])
|
675
784
|
if job_dict['job_name'] is None:
|
676
785
|
job_dict['job_name'] = job_dict['task_name']
|
677
786
|
jobs.append(job_dict)
|
@@ -723,3 +832,128 @@ def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
|
|
723
832
|
f'SELECT local_log_file FROM spot '
|
724
833
|
f'WHERE {filter_str}', filter_args).fetchone()
|
725
834
|
return local_log_file[-1] if local_log_file else None
|
835
|
+
|
836
|
+
|
837
|
+
# === Scheduler state functions ===
|
838
|
+
# Only the scheduler should call these functions. They may require holding the
|
839
|
+
# scheduler lock to work correctly.
|
840
|
+
|
841
|
+
|
842
|
+
def scheduler_set_waiting(job_id: int, dag_yaml_path: str) -> None:
|
843
|
+
"""Do not call without holding the scheduler lock."""
|
844
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
845
|
+
updated_count = cursor.execute(
|
846
|
+
'UPDATE job_info SET '
|
847
|
+
'schedule_state = (?), dag_yaml_path = (?) '
|
848
|
+
'WHERE spot_job_id = (?) AND schedule_state = (?)',
|
849
|
+
(ManagedJobScheduleState.WAITING.value, dag_yaml_path, job_id,
|
850
|
+
ManagedJobScheduleState.INACTIVE.value)).rowcount
|
851
|
+
assert updated_count == 1, (job_id, updated_count)
|
852
|
+
|
853
|
+
|
854
|
+
def scheduler_set_launching(job_id: int,
|
855
|
+
current_state: ManagedJobScheduleState) -> None:
|
856
|
+
"""Do not call without holding the scheduler lock."""
|
857
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
858
|
+
updated_count = cursor.execute(
|
859
|
+
'UPDATE job_info SET '
|
860
|
+
'schedule_state = (?) '
|
861
|
+
'WHERE spot_job_id = (?) AND schedule_state = (?)',
|
862
|
+
(ManagedJobScheduleState.LAUNCHING.value, job_id,
|
863
|
+
current_state.value)).rowcount
|
864
|
+
assert updated_count == 1, (job_id, updated_count)
|
865
|
+
|
866
|
+
|
867
|
+
def scheduler_set_alive(job_id: int) -> None:
|
868
|
+
"""Do not call without holding the scheduler lock."""
|
869
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
870
|
+
updated_count = cursor.execute(
|
871
|
+
'UPDATE job_info SET '
|
872
|
+
'schedule_state = (?) '
|
873
|
+
'WHERE spot_job_id = (?) AND schedule_state = (?)',
|
874
|
+
(ManagedJobScheduleState.ALIVE.value, job_id,
|
875
|
+
ManagedJobScheduleState.LAUNCHING.value)).rowcount
|
876
|
+
assert updated_count == 1, (job_id, updated_count)
|
877
|
+
|
878
|
+
|
879
|
+
def scheduler_set_alive_waiting(job_id: int) -> None:
|
880
|
+
"""Do not call without holding the scheduler lock."""
|
881
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
882
|
+
updated_count = cursor.execute(
|
883
|
+
'UPDATE job_info SET '
|
884
|
+
'schedule_state = (?) '
|
885
|
+
'WHERE spot_job_id = (?) AND schedule_state = (?)',
|
886
|
+
(ManagedJobScheduleState.ALIVE_WAITING.value, job_id,
|
887
|
+
ManagedJobScheduleState.ALIVE.value)).rowcount
|
888
|
+
assert updated_count == 1, (job_id, updated_count)
|
889
|
+
|
890
|
+
|
891
|
+
def scheduler_set_done(job_id: int, idempotent: bool = False) -> None:
|
892
|
+
"""Do not call without holding the scheduler lock."""
|
893
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
894
|
+
updated_count = cursor.execute(
|
895
|
+
'UPDATE job_info SET '
|
896
|
+
'schedule_state = (?) '
|
897
|
+
'WHERE spot_job_id = (?) AND schedule_state != (?)',
|
898
|
+
(ManagedJobScheduleState.DONE.value, job_id,
|
899
|
+
ManagedJobScheduleState.DONE.value)).rowcount
|
900
|
+
if not idempotent:
|
901
|
+
assert updated_count == 1, (job_id, updated_count)
|
902
|
+
|
903
|
+
|
904
|
+
def set_job_controller_pid(job_id: int, pid: int):
|
905
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
906
|
+
updated_count = cursor.execute(
|
907
|
+
'UPDATE job_info SET '
|
908
|
+
'controller_pid = (?) '
|
909
|
+
'WHERE spot_job_id = (?)', (pid, job_id)).rowcount
|
910
|
+
assert updated_count == 1, (job_id, updated_count)
|
911
|
+
|
912
|
+
|
913
|
+
def get_job_schedule_state(job_id: int) -> ManagedJobScheduleState:
|
914
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
915
|
+
state = cursor.execute(
|
916
|
+
'SELECT schedule_state FROM job_info WHERE spot_job_id = (?)',
|
917
|
+
(job_id,)).fetchone()[0]
|
918
|
+
return ManagedJobScheduleState(state)
|
919
|
+
|
920
|
+
|
921
|
+
def get_num_launching_jobs() -> int:
|
922
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
923
|
+
return cursor.execute(
|
924
|
+
'SELECT COUNT(*) '
|
925
|
+
'FROM job_info '
|
926
|
+
'WHERE schedule_state = (?)',
|
927
|
+
(ManagedJobScheduleState.LAUNCHING.value,)).fetchone()[0]
|
928
|
+
|
929
|
+
|
930
|
+
def get_num_alive_jobs() -> int:
|
931
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
932
|
+
return cursor.execute(
|
933
|
+
'SELECT COUNT(*) '
|
934
|
+
'FROM job_info '
|
935
|
+
'WHERE schedule_state IN (?, ?, ?)',
|
936
|
+
(ManagedJobScheduleState.ALIVE_WAITING.value,
|
937
|
+
ManagedJobScheduleState.LAUNCHING.value,
|
938
|
+
ManagedJobScheduleState.ALIVE.value)).fetchone()[0]
|
939
|
+
|
940
|
+
|
941
|
+
def get_waiting_job() -> Optional[Dict[str, Any]]:
|
942
|
+
"""Get the next job that should transition to LAUNCHING.
|
943
|
+
|
944
|
+
Backwards compatibility note: jobs submitted before #4485 will have no
|
945
|
+
schedule_state and will be ignored by this SQL query.
|
946
|
+
"""
|
947
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
948
|
+
row = cursor.execute(
|
949
|
+
'SELECT spot_job_id, schedule_state, dag_yaml_path '
|
950
|
+
'FROM job_info '
|
951
|
+
'WHERE schedule_state in (?, ?) '
|
952
|
+
'ORDER BY spot_job_id LIMIT 1',
|
953
|
+
(ManagedJobScheduleState.WAITING.value,
|
954
|
+
ManagedJobScheduleState.ALIVE_WAITING.value)).fetchone()
|
955
|
+
return {
|
956
|
+
'job_id': row[0],
|
957
|
+
'schedule_state': ManagedJobScheduleState(row[1]),
|
958
|
+
'dag_yaml_path': row[2],
|
959
|
+
} if row is not None else None
|