skypilot-nightly 1.0.0.dev20250114__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +50 -67
  3. sky/check.py +31 -1
  4. sky/cli.py +11 -34
  5. sky/clouds/kubernetes.py +3 -3
  6. sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
  7. sky/core.py +8 -5
  8. sky/data/storage.py +66 -14
  9. sky/global_user_state.py +1 -1
  10. sky/jobs/constants.py +8 -7
  11. sky/jobs/controller.py +19 -22
  12. sky/jobs/core.py +0 -2
  13. sky/jobs/recovery_strategy.py +114 -143
  14. sky/jobs/scheduler.py +283 -0
  15. sky/jobs/state.py +263 -21
  16. sky/jobs/utils.py +338 -96
  17. sky/provision/aws/config.py +48 -26
  18. sky/provision/gcp/instance_utils.py +15 -9
  19. sky/provision/kubernetes/instance.py +1 -1
  20. sky/provision/kubernetes/utils.py +76 -18
  21. sky/resources.py +1 -1
  22. sky/serve/autoscalers.py +359 -301
  23. sky/serve/controller.py +10 -8
  24. sky/serve/core.py +84 -7
  25. sky/serve/load_balancer.py +27 -10
  26. sky/serve/replica_managers.py +1 -3
  27. sky/serve/serve_state.py +10 -5
  28. sky/serve/serve_utils.py +28 -1
  29. sky/serve/service.py +4 -3
  30. sky/serve/service_spec.py +31 -0
  31. sky/skylet/constants.py +1 -1
  32. sky/skylet/events.py +7 -3
  33. sky/skylet/job_lib.py +10 -30
  34. sky/skylet/log_lib.py +8 -8
  35. sky/skylet/log_lib.pyi +3 -0
  36. sky/skylet/skylet.py +1 -1
  37. sky/templates/jobs-controller.yaml.j2 +7 -3
  38. sky/templates/sky-serve-controller.yaml.j2 +4 -0
  39. sky/utils/db_utils.py +18 -4
  40. sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
  41. sky/utils/resources_utils.py +25 -21
  42. sky/utils/schemas.py +13 -0
  43. sky/utils/subprocess_utils.py +48 -9
  44. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +4 -1
  45. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +49 -48
  46. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
  47. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +0 -0
  48. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
  49. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/jobs/scheduler.py ADDED
@@ -0,0 +1,283 @@
+ """Scheduler for managed jobs.
+
+ Once managed jobs are submitted via submit_job, the scheduler is responsible for
+ the business logic of deciding when they are allowed to start, and choosing the
+ right one to start. The scheduler will also schedule jobs that are already live
+ but waiting to launch a new task or recover.
+
+ The scheduler is not its own process - instead, maybe_schedule_next_jobs() can
+ be called from any code running on the managed jobs controller instance to
+ trigger scheduling of new jobs if possible. This function should be called
+ immediately after any state change that could result in jobs newly being able to
+ be scheduled.
+
+ The scheduling logic limits the number of running jobs according to two limits:
+ 1. The number of jobs that can be launching (that is, STARTING or RECOVERING) at
+    once, based on the number of CPUs. (See _get_launch_parallelism.) This the
+    most compute-intensive part of the job lifecycle, which is why we have an
+    additional limit.
+ 2. The number of jobs that can be running at any given time, based on the amount
+    of memory. (See _get_job_parallelism.) Since the job controller is doing very
+    little once a job starts (just checking its status periodically), the most
+    significant resource it consumes is memory.
+
+ The state of the scheduler is entirely determined by the schedule_state column
+ of all the jobs in the job_info table. This column should only be modified via
+ the functions defined in this file. We will always hold the lock while modifying
+ this state. See state.ManagedJobScheduleState.
+
+ Nomenclature:
+ - job: same as managed job (may include multiple tasks)
+ - launch/launching: launching a cluster (sky.launch) as part of a job
+ - start/run: create the job controller process for a job
+ - schedule: transition a job to the LAUNCHING state, whether a new job or a job
+   that is already alive
+ - alive: a job controller exists (includes multiple schedule_states: ALIVE,
+   ALIVE_WAITING, LAUNCHING)
+ """
+
+ from argparse import ArgumentParser
+ import contextlib
+ from functools import lru_cache
+ import os
+ import time
+
+ import filelock
+ import psutil
+
+ from sky import sky_logging
+ from sky.jobs import constants as managed_job_constants
+ from sky.jobs import state
+ from sky.skylet import constants
+ from sky.utils import subprocess_utils
+
+ logger = sky_logging.init_logger('sky.jobs.controller')
+
+ # The _MANAGED_JOB_SCHEDULER_LOCK should be held whenever we are checking the
+ # parallelism control or updating the schedule_state of any job.
+ # Any code that takes this lock must conclude by calling
+ # maybe_schedule_next_jobs.
+ _MANAGED_JOB_SCHEDULER_LOCK = '~/.sky/locks/managed_job_scheduler.lock'
+ _ALIVE_JOB_LAUNCH_WAIT_INTERVAL = 0.5
+
+
+ @lru_cache(maxsize=1)
+ def _get_lock_path() -> str:
+     path = os.path.expanduser(_MANAGED_JOB_SCHEDULER_LOCK)
+     os.makedirs(os.path.dirname(path), exist_ok=True)
+     return path
+
+
+ def maybe_schedule_next_jobs() -> None:
+     """Determine if any managed jobs can be scheduled, and if so, schedule them.
+
+     Here, "schedule" means to select job that is waiting, and allow it to
+     proceed. It does NOT mean to submit a job to the scheduler.
+
+     For newly submitted jobs, scheduling means updating the state of the jobs,
+     and starting the job controller process. For jobs that are already alive but
+     are waiting to launch a new task or recover, just update the state of the
+     job to indicate that the launch can proceed.
+
+     This function transitions jobs into LAUNCHING on a best-effort basis. That
+     is, if we can start any jobs, we will, but if not, we will exit (almost)
+     immediately. It's expected that if some WAITING or ALIVE_WAITING jobs cannot
+     be started now (either because the lock is held, or because there are not
+     enough resources), another call to this function will be made whenever that
+     situation is resolved. (If the lock is held, the lock holder should start
+     the jobs. If there aren't enough resources, the next controller to exit and
+     free up resources should start the jobs.)
+
+     If this function obtains the lock, it will launch as many jobs as possible
+     before releasing the lock. This is what allows other calls to exit
+     immediately if the lock is held, while ensuring that all jobs are started as
+     soon as possible.
+
+     This uses subprocess_utils.launch_new_process_tree() to start the controller
+     processes, which should be safe to call from pretty much any code running on
+     the jobs controller instance. New job controller processes will be detached
+     from the current process and there will not be a parent/child relationship.
+     See launch_new_process_tree for more.
+     """
+     try:
+         # We must use a global lock rather than a per-job lock to ensure correct
+         # parallelism control. If we cannot obtain the lock, exit immediately.
+         # The current lock holder is expected to launch any jobs it can before
+         # releasing the lock.
+         with filelock.FileLock(_get_lock_path(), blocking=False):
+             while True:
+                 maybe_next_job = state.get_waiting_job()
+                 if maybe_next_job is None:
+                     # Nothing left to start, break from scheduling loop
+                     break
+
+                 current_state = maybe_next_job['schedule_state']
+
+                 assert current_state in (
+                     state.ManagedJobScheduleState.ALIVE_WAITING,
+                     state.ManagedJobScheduleState.WAITING), maybe_next_job
+
+                 # Note: we expect to get ALIVE_WAITING jobs before WAITING jobs,
+                 # since they will have been submitted and therefore started
+                 # first. The requirements to launch in an alive job are more
+                 # lenient, so there is no way that we wouldn't be able to launch
+                 # an ALIVE_WAITING job, but we would be able to launch a WAITING
+                 # job.
+                 if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
+                     if not _can_lauch_in_alive_job():
+                         # Can't schedule anything, break from scheduling loop.
+                         break
+                 elif current_state == state.ManagedJobScheduleState.WAITING:
+                     if not _can_start_new_job():
+                         # Can't schedule anything, break from scheduling loop.
+                         break
+
+                 logger.debug(f'Scheduling job {maybe_next_job["job_id"]}')
+                 state.scheduler_set_launching(maybe_next_job['job_id'],
+                                               current_state)
+
+                 if current_state == state.ManagedJobScheduleState.WAITING:
+                     # The job controller has not been started yet. We must start
+                     # it.
+
+                     job_id = maybe_next_job['job_id']
+                     dag_yaml_path = maybe_next_job['dag_yaml_path']
+
+                     # If the command line here is changed, please also update
+                     # utils._controller_process_alive. `--job-id X` should be at
+                     # the end.
+                     run_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};'
+                                'python -u -m sky.jobs.controller '
+                                f'{dag_yaml_path} --job-id {job_id}')
+
+                     logs_dir = os.path.expanduser(
+                         managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+                     os.makedirs(logs_dir, exist_ok=True)
+                     log_path = os.path.join(logs_dir, f'{job_id}.log')
+
+                     pid = subprocess_utils.launch_new_process_tree(
+                         run_cmd, log_output=log_path)
+                     state.set_job_controller_pid(job_id, pid)
+
+                     logger.debug(f'Job {job_id} started with pid {pid}')
+
+     except filelock.Timeout:
+         # If we can't get the lock, just exit. The process holding the lock
+         # should launch any pending jobs.
+         pass
+
+
+ def submit_job(job_id: int, dag_yaml_path: str) -> None:
+     """Submit an existing job to the scheduler.
+
+     This should be called after a job is created in the `spot` table as
+     PENDING. It will tell the scheduler to try and start the job controller, if
+     there are resources available. It may block to acquire the lock, so it
+     should not be on the critical path for `sky jobs launch -d`.
+     """
+     with filelock.FileLock(_get_lock_path()):
+         state.scheduler_set_waiting(job_id, dag_yaml_path)
+     maybe_schedule_next_jobs()
+
+
+ @contextlib.contextmanager
+ def scheduled_launch(job_id: int):
+     """Launch as part of an ongoing job.
+
+     A newly started job will already be LAUNCHING, and this will immediately
+     enter the context.
+
+     If a job is ongoing (ALIVE schedule_state), there are two scenarios where we
+     may need to call sky.launch again during the course of a job controller:
+     - for tasks after the first task
+     - for recovery
+
+     This function will mark the job as ALIVE_WAITING, which indicates to the
+     scheduler that it wants to transition back to LAUNCHING. Then, it will wait
+     until the scheduler transitions the job state, before entering the context.
+
+     On exiting the context, the job will transition to ALIVE.
+
+     This should only be used within the job controller for the given job_id. If
+     multiple uses of this context are nested, behavior is undefined. Don't do
+     that.
+     """
+
+     # If we're already in LAUNCHING schedule_state, we don't need to wait.
+     # This may be the case for the first launch of a job.
+     if (state.get_job_schedule_state(job_id) !=
+             state.ManagedJobScheduleState.LAUNCHING):
+         # Since we aren't LAUNCHING, we need to wait to be scheduled.
+         _set_alive_waiting(job_id)
+
+         while (state.get_job_schedule_state(job_id) !=
+                state.ManagedJobScheduleState.LAUNCHING):
+             time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
+
+     yield
+
+     with filelock.FileLock(_get_lock_path()):
+         state.scheduler_set_alive(job_id)
+     maybe_schedule_next_jobs()
+
+
+ def job_done(job_id: int, idempotent: bool = False) -> None:
+     """Transition a job to DONE.
+
+     If idempotent is True, this will not raise an error if the job is already
+     DONE.
+
+     The job could be in any terminal ManagedJobStatus. However, once DONE, it
+     should never transition back to another state.
+     """
+     if idempotent and (state.get_job_schedule_state(job_id)
+                        == state.ManagedJobScheduleState.DONE):
+         return
+
+     with filelock.FileLock(_get_lock_path()):
+         state.scheduler_set_done(job_id, idempotent)
+     maybe_schedule_next_jobs()
+
+
+ def _set_alive_waiting(job_id: int) -> None:
+     """Should use wait_until_launch_okay() to transition to this state."""
+     with filelock.FileLock(_get_lock_path()):
+         state.scheduler_set_alive_waiting(job_id)
+     maybe_schedule_next_jobs()
+
+
+ def _get_job_parallelism() -> int:
+     # Assume a running job uses 350MB memory.
+     # We observe 230-300 in practice.
+     job_memory = 350 * 1024 * 1024
+     return max(psutil.virtual_memory().total // job_memory, 1)
+
+
+ def _get_launch_parallelism() -> int:
+     cpus = os.cpu_count()
+     return cpus * 4 if cpus is not None else 1
+
+
+ def _can_start_new_job() -> bool:
+     launching_jobs = state.get_num_launching_jobs()
+     alive_jobs = state.get_num_alive_jobs()
+     return launching_jobs < _get_launch_parallelism(
+     ) and alive_jobs < _get_job_parallelism()
+
+
+ def _can_lauch_in_alive_job() -> bool:
+     launching_jobs = state.get_num_launching_jobs()
+     return launching_jobs < _get_launch_parallelism()
+
+
+ if __name__ == '__main__':
+     parser = ArgumentParser()
+     parser.add_argument('--job-id',
+                         required=True,
+                         type=int,
+                         help='Job id for the controller job.')
+     parser.add_argument('dag_yaml',
+                         type=str,
+                         help='The path to the user job yaml file.')
+     args = parser.parse_args()
+     submit_job(args.job_id, args.dag_yaml)
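
The new module adds up to a small API for code running on the jobs controller: submit_job() once a job is recorded, scheduled_launch() around every sky.launch (first launch, later tasks, and recovery), job_done() on exit, and maybe_schedule_next_jobs() after any state change that might free a slot. With the limits above, an 8-CPU / 32 GiB controller would allow 8 * 4 = 32 concurrent launches and roughly 32 GiB / 350 MB, i.e. about 93 alive jobs. A minimal sketch of the intended call pattern, based only on the docstrings above (the wrapper function names here are illustrative, not part of the wheel):

# Illustrative sketch, not shipped in the wheel: how controller-side code is
# expected to drive the scheduler API described above.
from sky.jobs import scheduler


def on_job_created(job_id: int, dag_yaml_path: str) -> None:
    # Called once after the job is written to the 'spot' table as PENDING.
    # May block briefly on the scheduler lock, then starts the job controller
    # if launch/memory slots are free.
    scheduler.submit_job(job_id, dag_yaml_path)


def launch_or_recover(job_id: int) -> None:
    # Inside the job controller process: every sky.launch (a later task or a
    # recovery) is wrapped in scheduled_launch(), which waits for a LAUNCHING
    # slot and transitions the job back to ALIVE when the block exits.
    with scheduler.scheduled_launch(job_id):
        pass  # the controller's actual sky.launch call goes here


def on_controller_exit(job_id: int) -> None:
    # Once the job reaches a terminal status, mark it DONE so its slots are
    # freed and any WAITING/ALIVE_WAITING jobs can be scheduled.
    scheduler.job_done(job_id, idempotent=True)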
sky/jobs/state.py CHANGED
@@ -107,12 +107,25 @@ def create_table(cursor, conn):
      db_utils.add_column_to_table(cursor, conn, 'spot', 'local_log_file',
                                   'TEXT DEFAULT NULL')

-     # `job_info` contains the mapping from job_id to the job_name.
-     # In the future, it may contain more information about each job.
+     # `job_info` contains the mapping from job_id to the job_name, as well as
+     # information used by the scheduler.
      cursor.execute("""\
          CREATE TABLE IF NOT EXISTS job_info (
          spot_job_id INTEGER PRIMARY KEY AUTOINCREMENT,
-         name TEXT)""")
+         name TEXT,
+         schedule_state TEXT,
+         controller_pid INTEGER DEFAULT NULL,
+         dag_yaml_path TEXT)""")
+
+     db_utils.add_column_to_table(cursor, conn, 'job_info', 'schedule_state',
+                                  'TEXT')
+
+     db_utils.add_column_to_table(cursor, conn, 'job_info', 'controller_pid',
+                                  'INTEGER DEFAULT NULL')
+
+     db_utils.add_column_to_table(cursor, conn, 'job_info', 'dag_yaml_path',
+                                  'TEXT')
+

      conn.commit()

@@ -164,6 +177,9 @@ columns = [
      # columns from the job_info table
      '_job_info_job_id',  # This should be the same as job_id
      'job_name',
+     'schedule_state',
+     'controller_pid',
+     'dag_yaml_path',
  ]


@@ -189,16 +205,18 @@ class ManagedJobStatus(enum.Enum):
          SUCCEEDED -> SUCCEEDED
          FAILED -> FAILED
          FAILED_SETUP -> FAILED_SETUP
+     Not all statuses are in this list, since some ManagedJobStatuses are only
+     possible while the cluster is INIT/STOPPED/not yet UP.
      Note that the JobStatus will not be stuck in PENDING, because each cluster
      is dedicated to a managed job, i.e. there should always be enough resource
      to run the job and the job will be immediately transitioned to RUNNING.
+
      """
      # PENDING: Waiting for the jobs controller to have a slot to run the
      # controller process.
-     # The submitted_at timestamp of the managed job in the 'spot' table will be
-     # set to the time when the job is firstly submitted by the user (set to
-     # PENDING).
      PENDING = 'PENDING'
+     # The submitted_at timestamp of the managed job in the 'spot' table will be
+     # set to the time when the job controller begins running.
      # SUBMITTED: The jobs controller starts the controller process.
      SUBMITTED = 'SUBMITTED'
      # STARTING: The controller process is launching the cluster for the managed
@@ -292,14 +310,72 @@ _SPOT_STATUS_TO_COLOR = {
  }


+ class ManagedJobScheduleState(enum.Enum):
+     """Captures the state of the job from the scheduler's perspective.
+
+     A job that predates the introduction of the scheduler will be INVALID.
+
+     A newly created job will be INACTIVE. The following transitions are valid:
+     - INACTIVE -> WAITING: The job is "submitted" to the scheduler, and its job
+       controller can be started.
+     - WAITING -> LAUNCHING: The job controller is starting by the scheduler and
+       may proceed to sky.launch.
+     - LAUNCHING -> ALIVE: The launch attempt was completed. It may have
+       succeeded or failed. The job controller is not allowed to sky.launch again
+       without transitioning to ALIVE_WAITING and then LAUNCHING.
+     - ALIVE -> ALIVE_WAITING: The job controller wants to sky.launch again,
+       either for recovery or to launch a subsequent task.
+     - ALIVE_WAITING -> LAUNCHING: The scheduler has determined that the job
+       controller may launch again.
+     - LAUNCHING, ALIVE, or ALIVE_WAITING -> DONE: The job controller is exiting
+       and the job is in some terminal status. In the future it may be possible
+       to transition directly from WAITING or even INACTIVE to DONE if the job is
+       cancelled.
+
+     There is no well-defined mapping from the managed job status to schedule
+     state or vice versa. (In fact, schedule state is defined on the job and
+     status on the task.)
+     - INACTIVE or WAITING should only be seen when a job is PENDING.
+     - ALIVE_WAITING should only be seen when a job is RECOVERING, has multiple
+       tasks, or needs to retry launching.
+     - LAUNCHING and ALIVE can be seen in many different statuses.
+     - DONE should only be seen when a job is in a terminal status.
+     Since state and status transitions are not atomic, it may be possible to
+     briefly observe inconsistent states, like a job that just finished but
+     hasn't yet transitioned to DONE.
+     """
+     # This job may have been created before scheduler was introduced in #4458.
+     # This state is not used by scheduler but just for backward compatibility.
+     # TODO(cooperc): remove this in v0.11.0
+     INVALID = None
+     # The job should be ignored by the scheduler.
+     INACTIVE = 'INACTIVE'
+     # The job is waiting to transition to LAUNCHING for the first time. The
+     # scheduler should try to transition it, and when it does, it should start
+     # the job controller.
+     WAITING = 'WAITING'
+     # The job is already alive, but wants to transition back to LAUNCHING,
+     # e.g. for recovery, or launching later tasks in the DAG. The scheduler
+     # should try to transition it to LAUNCHING.
+     ALIVE_WAITING = 'ALIVE_WAITING'
+     # The job is running sky.launch, or soon will, using a limited number of
+     # allowed launch slots.
+     LAUNCHING = 'LAUNCHING'
+     # The controller for the job is running, but it's not currently launching.
+     ALIVE = 'ALIVE'
+     # The job is in a terminal state. (Not necessarily SUCCEEDED.)
+     DONE = 'DONE'
+
+
  # === Status transition functions ===
- def set_job_name(job_id: int, name: str):
+ def set_job_info(job_id: int, name: str):
      with db_utils.safe_cursor(_DB_PATH) as cursor:
          cursor.execute(
              """\
              INSERT INTO job_info
-             (spot_job_id, name)
-             VALUES (?, ?)""", (job_id, name))
+             (spot_job_id, name, schedule_state)
+             VALUES (?, ?, ?)""",
+             (job_id, name, ManagedJobScheduleState.INACTIVE.value))


  def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
@@ -324,7 +400,7 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
          job_id: The managed job ID.
          task_id: The task ID.
          run_timestamp: The run_timestamp of the run. This will be used to
-         determine the log directory of the managed task.
+             determine the log directory of the managed task.
          submit_time: The time when the managed task is submitted.
          resources_str: The resources string of the managed task.
          specs: The specs of the managed task.
@@ -458,13 +534,12 @@ def set_failed(
      with db_utils.safe_cursor(_DB_PATH) as cursor:
          previous_status = cursor.execute(
              'SELECT status FROM spot WHERE spot_job_id=(?)',
-             (job_id,)).fetchone()
-         previous_status = ManagedJobStatus(previous_status[0])
-         if previous_status in [ManagedJobStatus.RECOVERING]:
-             # If the job is recovering, we should set the
-             # last_recovered_at to the end_time, so that the
-             # end_at - last_recovered_at will not be affect the job duration
-             # calculation.
+             (job_id,)).fetchone()[0]
+         previous_status = ManagedJobStatus(previous_status)
+         if previous_status == ManagedJobStatus.RECOVERING:
+             # If the job is recovering, we should set the last_recovered_at to
+             # the end_time, so that the end_at - last_recovered_at will not be
+             # affect the job duration calculation.
              fields_to_set['last_recovered_at'] = end_time
          set_str = ', '.join(f'{k}=(?)' for k in fields_to_set)
          task_str = '' if task_id is None else f' AND task_id={task_id}'
@@ -564,6 +639,44 @@ def get_nonterminal_job_ids_by_name(name: Optional[str]) -> List[int]:
          return job_ids


+ def get_schedule_live_jobs(job_id: Optional[int]) -> List[Dict[str, Any]]:
+     """Get jobs from the database that have a live schedule_state.
+
+     This should return job(s) that are not INACTIVE, WAITING, or DONE. So a
+     returned job should correspond to a live job controller process, with one
+     exception: the job may have just transitioned from WAITING to LAUNCHING, but
+     the controller process has not yet started.
+     """
+     job_filter = '' if job_id is None else 'AND spot_job_id=(?)'
+     job_value = (job_id,) if job_id is not None else ()
+
+     # Join spot and job_info tables to get the job name for each task.
+     # We use LEFT OUTER JOIN mainly for backward compatibility, as for an
+     # existing controller before #1982, the job_info table may not exist,
+     # and all the managed jobs created before will not present in the
+     # job_info.
+     with db_utils.safe_cursor(_DB_PATH) as cursor:
+         rows = cursor.execute(
+             f"""\
+             SELECT spot_job_id, schedule_state, controller_pid
+             FROM job_info
+             WHERE schedule_state not in (?, ?, ?)
+             {job_filter}
+             ORDER BY spot_job_id DESC""",
+             (ManagedJobScheduleState.INACTIVE.value,
+              ManagedJobScheduleState.WAITING.value,
+              ManagedJobScheduleState.DONE.value, *job_value)).fetchall()
+         jobs = []
+         for row in rows:
+             job_dict = {
+                 'job_id': row[0],
+                 'schedule_state': ManagedJobScheduleState(row[1]),
+                 'controller_pid': row[2],
+             }
+             jobs.append(job_dict)
+         return jobs
+
+
  def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
      """Get all job ids by name."""
      name_filter = ''
@@ -620,10 +733,12 @@ def get_latest_task_id_status(
      id_statuses = _get_all_task_ids_statuses(job_id)
      if not id_statuses:
          return None, None
-     task_id, status = id_statuses[-1]
-     for task_id, status in id_statuses:
-         if not status.is_terminal():
-             break
+     task_id, status = next(
+         ((tid, st) for tid, st in id_statuses if not st.is_terminal()),
+         id_statuses[-1],
+     )
+     # Unpack the tuple first, or it triggers a Pylint's bug on recognizing
+     # the return type.
      return task_id, status


@@ -670,6 +785,8 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
          for row in rows:
              job_dict = dict(zip(columns, row))
              job_dict['status'] = ManagedJobStatus(job_dict['status'])
+             job_dict['schedule_state'] = ManagedJobScheduleState(
+                 job_dict['schedule_state'])
              if job_dict['job_name'] is None:
                  job_dict['job_name'] = job_dict['task_name']
              jobs.append(job_dict)
@@ -721,3 +838,128 @@ def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
              f'SELECT local_log_file FROM spot '
              f'WHERE {filter_str}', filter_args).fetchone()
      return local_log_file[-1] if local_log_file else None
+
+
+ # === Scheduler state functions ===
+ # Only the scheduler should call these functions. They may require holding the
+ # scheduler lock to work correctly.
+
+
+ def scheduler_set_waiting(job_id: int, dag_yaml_path: str) -> None:
+     """Do not call without holding the scheduler lock."""
+     with db_utils.safe_cursor(_DB_PATH) as cursor:
+         updated_count = cursor.execute(
+             'UPDATE job_info SET '
+             'schedule_state = (?), dag_yaml_path = (?) '
+             'WHERE spot_job_id = (?) AND schedule_state = (?)',
+             (ManagedJobScheduleState.WAITING.value, dag_yaml_path, job_id,
+              ManagedJobScheduleState.INACTIVE.value)).rowcount
+         assert updated_count == 1, (job_id, updated_count)
+
+
+ def scheduler_set_launching(job_id: int,
+                             current_state: ManagedJobScheduleState) -> None:
+     """Do not call without holding the scheduler lock."""
+     with db_utils.safe_cursor(_DB_PATH) as cursor:
+         updated_count = cursor.execute(
+             'UPDATE job_info SET '
+             'schedule_state = (?) '
+             'WHERE spot_job_id = (?) AND schedule_state = (?)',
+             (ManagedJobScheduleState.LAUNCHING.value, job_id,
+              current_state.value)).rowcount
+         assert updated_count == 1, (job_id, updated_count)
+
+
+ def scheduler_set_alive(job_id: int) -> None:
+     """Do not call without holding the scheduler lock."""
+     with db_utils.safe_cursor(_DB_PATH) as cursor:
+         updated_count = cursor.execute(
+             'UPDATE job_info SET '
+             'schedule_state = (?) '
+             'WHERE spot_job_id = (?) AND schedule_state = (?)',
+             (ManagedJobScheduleState.ALIVE.value, job_id,
+              ManagedJobScheduleState.LAUNCHING.value)).rowcount
+         assert updated_count == 1, (job_id, updated_count)
+
+
+ def scheduler_set_alive_waiting(job_id: int) -> None:
+     """Do not call without holding the scheduler lock."""
+     with db_utils.safe_cursor(_DB_PATH) as cursor:
+         updated_count = cursor.execute(
+             'UPDATE job_info SET '
+             'schedule_state = (?) '
+             'WHERE spot_job_id = (?) AND schedule_state = (?)',
+             (ManagedJobScheduleState.ALIVE_WAITING.value, job_id,
+              ManagedJobScheduleState.ALIVE.value)).rowcount
+         assert updated_count == 1, (job_id, updated_count)
+
+
+ def scheduler_set_done(job_id: int, idempotent: bool = False) -> None:
+     """Do not call without holding the scheduler lock."""
+     with db_utils.safe_cursor(_DB_PATH) as cursor:
+         updated_count = cursor.execute(
+             'UPDATE job_info SET '
+             'schedule_state = (?) '
+             'WHERE spot_job_id = (?) AND schedule_state != (?)',
+             (ManagedJobScheduleState.DONE.value, job_id,
+              ManagedJobScheduleState.DONE.value)).rowcount
+         if not idempotent:
+             assert updated_count == 1, (job_id, updated_count)
+
+
+ def set_job_controller_pid(job_id: int, pid: int):
+     with db_utils.safe_cursor(_DB_PATH) as cursor:
+         updated_count = cursor.execute(
+             'UPDATE job_info SET '
+             'controller_pid = (?) '
+             'WHERE spot_job_id = (?)', (pid, job_id)).rowcount
+         assert updated_count == 1, (job_id, updated_count)
+
+
+ def get_job_schedule_state(job_id: int) -> ManagedJobScheduleState:
+     with db_utils.safe_cursor(_DB_PATH) as cursor:
+         state = cursor.execute(
+             'SELECT schedule_state FROM job_info WHERE spot_job_id = (?)',
+             (job_id,)).fetchone()[0]
+     return ManagedJobScheduleState(state)
+
+
+ def get_num_launching_jobs() -> int:
+     with db_utils.safe_cursor(_DB_PATH) as cursor:
+         return cursor.execute(
+             'SELECT COUNT(*) '
+             'FROM job_info '
+             'WHERE schedule_state = (?)',
+             (ManagedJobScheduleState.LAUNCHING.value,)).fetchone()[0]
+
+
+ def get_num_alive_jobs() -> int:
+     with db_utils.safe_cursor(_DB_PATH) as cursor:
+         return cursor.execute(
+             'SELECT COUNT(*) '
+             'FROM job_info '
+             'WHERE schedule_state IN (?, ?, ?)',
+             (ManagedJobScheduleState.ALIVE_WAITING.value,
+              ManagedJobScheduleState.LAUNCHING.value,
+              ManagedJobScheduleState.ALIVE.value)).fetchone()[0]
+
+
+ def get_waiting_job() -> Optional[Dict[str, Any]]:
+     """Get the next job that should transition to LAUNCHING.
+
+     Backwards compatibility note: jobs submitted before #4485 will have no
+     schedule_state and will be ignored by this SQL query.
+     """
+     with db_utils.safe_cursor(_DB_PATH) as cursor:
+         row = cursor.execute(
+             'SELECT spot_job_id, schedule_state, dag_yaml_path '
+             'FROM job_info '
+             'WHERE schedule_state in (?, ?) '
+             'ORDER BY spot_job_id LIMIT 1',
+             (ManagedJobScheduleState.WAITING.value,
+              ManagedJobScheduleState.ALIVE_WAITING.value)).fetchone()
+         return {
+             'job_id': row[0],
+             'schedule_state': ManagedJobScheduleState(row[1]),
+             'dag_yaml_path': row[2],
+         } if row is not None else None
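
For readers, the transition rules spelled out in the ManagedJobScheduleState docstring can be summarized as a lookup table. This is an editor's sketch rather than code from the wheel; in the package itself the transitions are enforced by the guarded UPDATE statements in the scheduler_set_* functions above, each of which only matches rows in the expected current state and asserts that exactly one row changed.

# Illustrative sketch, not shipped in the wheel: the valid schedule_state
# transitions described in the ManagedJobScheduleState docstring.
from sky.jobs.state import ManagedJobScheduleState as S

_VALID_TRANSITIONS = {
    S.INACTIVE: {S.WAITING},
    S.WAITING: {S.LAUNCHING},
    S.LAUNCHING: {S.ALIVE, S.DONE},
    S.ALIVE: {S.ALIVE_WAITING, S.DONE},
    S.ALIVE_WAITING: {S.LAUNCHING, S.DONE},
    S.DONE: set(),  # DONE is terminal; jobs never leave it.
}


def is_valid_transition(old: S, new: S) -> bool:
    # INVALID (pre-scheduler jobs) is not part of the state machine.
    return new in _VALID_TRANSITIONS.get(old, set())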