skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250910__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic; see the advisory on the package registry page for more details.

Files changed (67)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -4
  3. sky/backends/backend_utils.py +35 -1
  4. sky/backends/cloud_vm_ray_backend.py +2 -2
  5. sky/client/sdk.py +20 -0
  6. sky/client/sdk_async.py +18 -16
  7. sky/clouds/aws.py +3 -1
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-1d7e11230da3ca89.js} +1 -1
  10. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  11. sky/dashboard/out/clusters/[cluster].html +1 -1
  12. sky/dashboard/out/clusters.html +1 -1
  13. sky/dashboard/out/config.html +1 -1
  14. sky/dashboard/out/index.html +1 -1
  15. sky/dashboard/out/infra/[context].html +1 -1
  16. sky/dashboard/out/infra.html +1 -1
  17. sky/dashboard/out/jobs/[job].html +1 -1
  18. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/data/storage.py +5 -1
  26. sky/execution.py +21 -14
  27. sky/jobs/constants.py +3 -0
  28. sky/jobs/controller.py +732 -310
  29. sky/jobs/recovery_strategy.py +251 -129
  30. sky/jobs/scheduler.py +247 -174
  31. sky/jobs/server/core.py +20 -4
  32. sky/jobs/server/utils.py +2 -2
  33. sky/jobs/state.py +702 -511
  34. sky/jobs/utils.py +94 -39
  35. sky/provision/aws/config.py +4 -1
  36. sky/provision/gcp/config.py +6 -1
  37. sky/provision/kubernetes/utils.py +17 -8
  38. sky/provision/provisioner.py +1 -0
  39. sky/serve/replica_managers.py +0 -7
  40. sky/serve/serve_utils.py +5 -0
  41. sky/serve/server/impl.py +1 -2
  42. sky/serve/service.py +0 -2
  43. sky/server/common.py +8 -3
  44. sky/server/config.py +43 -24
  45. sky/server/constants.py +1 -0
  46. sky/server/daemons.py +7 -11
  47. sky/server/requests/serializers/encoders.py +1 -1
  48. sky/server/server.py +8 -1
  49. sky/setup_files/dependencies.py +4 -2
  50. sky/skylet/attempt_skylet.py +1 -0
  51. sky/skylet/constants.py +3 -1
  52. sky/skylet/events.py +2 -10
  53. sky/utils/command_runner.pyi +3 -3
  54. sky/utils/common_utils.py +11 -1
  55. sky/utils/controller_utils.py +5 -0
  56. sky/utils/db/db_utils.py +31 -2
  57. sky/utils/rich_utils.py +3 -1
  58. sky/utils/subprocess_utils.py +9 -0
  59. sky/volumes/volume.py +2 -0
  60. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/METADATA +39 -37
  61. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/RECORD +67 -67
  62. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_buildManifest.js +0 -0
  63. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_ssgManifest.js +0 -0
  64. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/WHEEL +0 -0
  65. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/entry_points.txt +0 -0
  66. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/licenses/LICENSE +0 -0
  67. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -1,13 +1,17 @@
1
1
  """The database for managed jobs status."""
2
2
  # TODO(zhwu): maybe use file based status instead of database, so
3
3
  # that we can easily switch to a s3-based storage.
4
+ import asyncio
4
5
  import enum
5
6
  import functools
7
+ import ipaddress
6
8
  import json
9
+ import sqlite3
7
10
  import threading
8
11
  import time
9
12
  import typing
10
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
13
+ from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, Union
14
+ import urllib.parse
11
15
 
12
16
  import colorama
13
17
  import sqlalchemy
@@ -15,27 +19,34 @@ from sqlalchemy import exc as sqlalchemy_exc
15
19
  from sqlalchemy import orm
16
20
  from sqlalchemy.dialects import postgresql
17
21
  from sqlalchemy.dialects import sqlite
22
+ from sqlalchemy.ext import asyncio as sql_async
18
23
  from sqlalchemy.ext import declarative
19
24
 
20
25
  from sky import exceptions
21
26
  from sky import sky_logging
27
+ from sky import skypilot_config
22
28
  from sky.skylet import constants
23
29
  from sky.utils import common_utils
30
+ from sky.utils import context_utils
24
31
  from sky.utils.db import db_utils
25
32
  from sky.utils.db import migration_utils
26
33
 
27
34
  if typing.TYPE_CHECKING:
28
35
  from sqlalchemy.engine import row
29
36
 
30
- import sky
31
-
32
- CallbackType = Callable[[str], None]
37
+ # Separate callback types for sync and async contexts
38
+ SyncCallbackType = Callable[[str], None]
39
+ AsyncCallbackType = Callable[[str], Awaitable[Any]]
40
+ CallbackType = Union[SyncCallbackType, AsyncCallbackType]
33
41
 
34
42
  logger = sky_logging.init_logger(__name__)
35
43
 
36
44
  _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
45
+ _SQLALCHEMY_ENGINE_ASYNC: Optional[sql_async.AsyncEngine] = None
37
46
  _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
38
47
 
48
+ _DB_RETRY_TIMES = 30
49
+
39
50
  Base = declarative.declarative_base()
40
51
 
41
52
  # === Database schema ===
@@ -70,7 +81,7 @@ spot_table = sqlalchemy.Table(
70
81
  sqlalchemy.Column('recovery_count', sqlalchemy.Integer, server_default='0'),
71
82
  sqlalchemy.Column('job_duration', sqlalchemy.Float, server_default='0'),
72
83
  sqlalchemy.Column('failure_reason', sqlalchemy.Text),
73
- sqlalchemy.Column('spot_job_id', sqlalchemy.Integer),
84
+ sqlalchemy.Column('spot_job_id', sqlalchemy.Integer, index=True),
74
85
  sqlalchemy.Column('task_id', sqlalchemy.Integer, server_default='0'),
75
86
  sqlalchemy.Column('task_name', sqlalchemy.Text),
76
87
  sqlalchemy.Column('specs', sqlalchemy.Text),
@@ -129,6 +140,7 @@ def create_table(engine: sqlalchemy.engine.Engine):
129
140
  try:
130
141
  with orm.Session(engine) as session:
131
142
  session.execute(sqlalchemy.text('PRAGMA journal_mode=WAL'))
143
+ session.execute(sqlalchemy.text('PRAGMA synchronous=1'))
132
144
  session.commit()
133
145
  except sqlalchemy_exc.OperationalError as e:
134
146
  if 'database is locked' not in str(e):
@@ -141,6 +153,43 @@ def create_table(engine: sqlalchemy.engine.Engine):
141
153
  migration_utils.SPOT_JOBS_VERSION)
142
154
 
143
155
 
156
+ def force_no_postgres() -> bool:
157
+ """Force no postgres.
158
+
159
+ If the db is localhost on the api server, and we are not in consolidation
160
+ mode, we must force using sqlite and not using the api server on the jobs
161
+ controller.
162
+ """
163
+ conn_string = skypilot_config.get_nested(('db',), None)
164
+
165
+ if conn_string:
166
+ parsed = urllib.parse.urlparse(conn_string)
167
+ # it freezes if we use the normal get_consolidation_mode function
168
+ consolidation_mode = skypilot_config.get_nested(
169
+ ('jobs', 'controller', 'consolidation_mode'), default_value=False)
170
+ if ((parsed.hostname == 'localhost' or
171
+ ipaddress.ip_address(parsed.hostname).is_loopback) and
172
+ not consolidation_mode):
173
+ return True
174
+ return False
175
+
176
+
177
+ def initialize_and_get_db_async() -> sql_async.AsyncEngine:
178
+ global _SQLALCHEMY_ENGINE_ASYNC
179
+ if _SQLALCHEMY_ENGINE_ASYNC is not None:
180
+ return _SQLALCHEMY_ENGINE_ASYNC
181
+ with _SQLALCHEMY_ENGINE_LOCK:
182
+ if _SQLALCHEMY_ENGINE_ASYNC is not None:
183
+ return _SQLALCHEMY_ENGINE_ASYNC
184
+
185
+ _SQLALCHEMY_ENGINE_ASYNC = db_utils.get_engine('spot_jobs',
186
+ async_engine=True)
187
+
188
+ # to create the table in case an async function gets called first
189
+ initialize_and_get_db()
190
+ return _SQLALCHEMY_ENGINE_ASYNC
191
+
192
+
144
193
  # We wrap the sqlalchemy engine initialization in a thread
145
194
  # lock to ensure that multiple threads do not initialize the
146
195
  # engine which could result in a rare race condition where
@@ -149,7 +198,6 @@ def create_table(engine: sqlalchemy.engine.Engine):
149
198
  # which could result in e1 being garbage collected unexpectedly.
150
199
  def initialize_and_get_db() -> sqlalchemy.engine.Engine:
151
200
  global _SQLALCHEMY_ENGINE
152
-
153
201
  if _SQLALCHEMY_ENGINE is not None:
154
202
  return _SQLALCHEMY_ENGINE
155
203
 
@@ -167,13 +215,58 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
167
215
  return _SQLALCHEMY_ENGINE
168
216
 
169
217
 
218
+ def _init_db_async(func):
219
+ """Initialize the async database. Add backoff to the function call."""
220
+
221
+ @functools.wraps(func)
222
+ async def wrapper(*args, **kwargs):
223
+ if _SQLALCHEMY_ENGINE_ASYNC is None:
224
+ # this may happen multiple times since there is no locking
225
+ # here but thats fine, this is just a short circuit for the
226
+ # common case.
227
+ await context_utils.to_thread(initialize_and_get_db_async)
228
+
229
+ backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=5)
230
+ last_exc = None
231
+ for _ in range(_DB_RETRY_TIMES):
232
+ try:
233
+ return await func(*args, **kwargs)
234
+ except (sqlalchemy_exc.OperationalError,
235
+ asyncio.exceptions.TimeoutError, OSError,
236
+ sqlalchemy_exc.TimeoutError, sqlite3.OperationalError,
237
+ sqlalchemy_exc.InterfaceError, sqlite3.InterfaceError) as e:
238
+ last_exc = e
239
+ logger.debug(f'DB error: {last_exc}')
240
+ await asyncio.sleep(backoff.current_backoff())
241
+ raise last_exc
242
+
243
+ return wrapper
244
+
245
+
170
246
  def _init_db(func):
171
- """Initialize the database."""
247
+ """Initialize the database. Add backoff to the function call."""
172
248
 
173
249
  @functools.wraps(func)
174
250
  def wrapper(*args, **kwargs):
175
- initialize_and_get_db()
176
- return func(*args, **kwargs)
251
+ if _SQLALCHEMY_ENGINE is None:
252
+ # this may happen multiple times since there is no locking
253
+ # here but thats fine, this is just a short circuit for the
254
+ # common case.
255
+ initialize_and_get_db()
256
+
257
+ backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=10)
258
+ last_exc = None
259
+ for _ in range(_DB_RETRY_TIMES):
260
+ try:
261
+ return func(*args, **kwargs)
262
+ except (sqlalchemy_exc.OperationalError,
263
+ asyncio.exceptions.TimeoutError, OSError,
264
+ sqlalchemy_exc.TimeoutError, sqlite3.OperationalError,
265
+ sqlalchemy_exc.InterfaceError, sqlite3.InterfaceError) as e:
266
+ last_exc = e
267
+ logger.debug(f'DB error: {last_exc}')
268
+ time.sleep(backoff.current_backoff())
269
+ raise last_exc
177
270
 
178
271
  return wrapper
179
272
 
@@ -416,6 +509,10 @@ class ManagedJobScheduleState(enum.Enum):
416
509
  # This job may have been created before scheduler was introduced in #4458.
417
510
  # This state is not used by scheduler but just for backward compatibility.
418
511
  # TODO(cooperc): remove this in v0.11.0
512
+ # TODO(luca): the only states we need are INACTIVE, WAITING, ALIVE, and
513
+ # DONE. ALIVE = old LAUNCHING + ALIVE + ALIVE_BACKOFF + ALIVE_WAITING and
514
+ # will represent jobs that are claimed by a controller. Delete the rest
515
+ # in v0.13.0
419
516
  INVALID = None
420
517
  # The job should be ignored by the scheduler.
421
518
  INACTIVE = 'INACTIVE'
@@ -440,32 +537,6 @@ class ManagedJobScheduleState(enum.Enum):
440
537
 
441
538
 
442
539
  # === Status transition functions ===
443
- @_init_db
444
- def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
445
- pool: Optional[str], pool_hash: Optional[str]):
446
- assert _SQLALCHEMY_ENGINE is not None
447
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
448
- if (_SQLALCHEMY_ENGINE.dialect.name ==
449
- db_utils.SQLAlchemyDialect.SQLITE.value):
450
- insert_func = sqlite.insert
451
- elif (_SQLALCHEMY_ENGINE.dialect.name ==
452
- db_utils.SQLAlchemyDialect.POSTGRESQL.value):
453
- insert_func = postgresql.insert
454
- else:
455
- raise ValueError('Unsupported database dialect')
456
- insert_stmt = insert_func(job_info_table).values(
457
- spot_job_id=job_id,
458
- name=name,
459
- schedule_state=ManagedJobScheduleState.INACTIVE.value,
460
- workspace=workspace,
461
- entrypoint=entrypoint,
462
- pool=pool,
463
- pool_hash=pool_hash,
464
- )
465
- session.execute(insert_stmt)
466
- session.commit()
467
-
468
-
469
540
  @_init_db
470
541
  def set_job_info_without_job_id(name: str, workspace: str, entrypoint: str,
471
542
  pool: Optional[str],
@@ -517,6 +588,7 @@ def set_pending(
517
588
  ):
518
589
  """Set the task to pending state."""
519
590
  assert _SQLALCHEMY_ENGINE is not None
591
+
520
592
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
521
593
  session.execute(
522
594
  sqlalchemy.insert(spot_table).values(
@@ -530,76 +602,28 @@ def set_pending(
530
602
  session.commit()
531
603
 
532
604
 
533
- @_init_db
534
- def set_starting(job_id: int, task_id: int, run_timestamp: str,
535
- submit_time: float, resources_str: str,
536
- specs: Dict[str, Union[str,
537
- int]], callback_func: CallbackType):
538
- """Set the task to starting state.
539
-
540
- Args:
541
- job_id: The managed job ID.
542
- task_id: The task ID.
543
- run_timestamp: The run_timestamp of the run. This will be used to
544
- determine the log directory of the managed task.
545
- submit_time: The time when the managed task is submitted.
546
- resources_str: The resources string of the managed task.
547
- specs: The specs of the managed task.
548
- callback_func: The callback function.
549
- """
550
- assert _SQLALCHEMY_ENGINE is not None
551
- # Use the timestamp in the `run_timestamp` ('sky-2022-10...'), to make
552
- # the log directory and submission time align with each other, so as to
553
- # make it easier to find them based on one of the values.
554
- # Also, using the earlier timestamp should be closer to the term
555
- # `submit_at`, which represents the time the managed task is submitted.
556
- logger.info('Launching the spot cluster...')
557
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
558
- count = session.query(spot_table).filter(
559
- sqlalchemy.and_(
560
- spot_table.c.spot_job_id == job_id,
561
- spot_table.c.task_id == task_id,
562
- spot_table.c.status == ManagedJobStatus.PENDING.value,
563
- spot_table.c.end_at.is_(None),
564
- )).update({
565
- spot_table.c.resources: resources_str,
566
- spot_table.c.submitted_at: submit_time,
567
- spot_table.c.status: ManagedJobStatus.STARTING.value,
568
- spot_table.c.run_timestamp: run_timestamp,
569
- spot_table.c.specs: json.dumps(specs),
570
- })
571
- session.commit()
572
- if count != 1:
573
- raise exceptions.ManagedJobStatusError(
574
- 'Failed to set the task to starting. '
575
- f'({count} rows updated)')
576
- # SUBMITTED is no longer used, but we keep it for backward compatibility.
577
- # TODO(cooperc): remove this in v0.12.0
578
- callback_func('SUBMITTED')
579
- callback_func('STARTING')
580
-
581
-
582
- @_init_db
583
- def set_backoff_pending(job_id: int, task_id: int):
605
+ @_init_db_async
606
+ async def set_backoff_pending_async(job_id: int, task_id: int):
584
607
  """Set the task to PENDING state if it is in backoff.
585
608
 
586
609
  This should only be used to transition from STARTING or RECOVERING back to
587
610
  PENDING.
588
611
  """
589
- assert _SQLALCHEMY_ENGINE is not None
590
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
591
- count = session.query(spot_table).filter(
592
- sqlalchemy.and_(
593
- spot_table.c.spot_job_id == job_id,
594
- spot_table.c.task_id == task_id,
595
- spot_table.c.status.in_([
596
- ManagedJobStatus.STARTING.value,
597
- ManagedJobStatus.RECOVERING.value
598
- ]),
599
- spot_table.c.end_at.is_(None),
600
- )).update({spot_table.c.status: ManagedJobStatus.PENDING.value})
601
- session.commit()
602
- logger.debug('back to PENDING')
612
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
613
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
614
+ count = await session.execute(
615
+ sqlalchemy.update(spot_table).where(
616
+ sqlalchemy.and_(
617
+ spot_table.c.spot_job_id == job_id,
618
+ spot_table.c.task_id == task_id,
619
+ spot_table.c.status.in_([
620
+ ManagedJobStatus.STARTING.value,
621
+ ManagedJobStatus.RECOVERING.value
622
+ ]),
623
+ spot_table.c.end_at.is_(None),
624
+ )).values({spot_table.c.status: ManagedJobStatus.PENDING.value})
625
+ )
626
+ await session.commit()
603
627
  if count != 1:
604
628
  raise exceptions.ManagedJobStatusError(
605
629
  'Failed to set the task back to pending. '
@@ -608,7 +632,7 @@ def set_backoff_pending(job_id: int, task_id: int):
608
632
 
609
633
 
610
634
  @_init_db
611
- def set_restarting(job_id: int, task_id: int, recovering: bool):
635
+ async def set_restarting_async(job_id: int, task_id: int, recovering: bool):
612
636
  """Set the task back to STARTING or RECOVERING from PENDING.
613
637
 
614
638
  This should not be used for the initial transition from PENDING to STARTING.
@@ -616,19 +640,20 @@ def set_restarting(job_id: int, task_id: int, recovering: bool):
616
640
  after using set_backoff_pending to transition back to PENDING during
617
641
  launch retry backoff.
618
642
  """
619
- assert _SQLALCHEMY_ENGINE is not None
643
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
620
644
  target_status = ManagedJobStatus.STARTING.value
621
645
  if recovering:
622
646
  target_status = ManagedJobStatus.RECOVERING.value
623
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
624
- count = session.query(spot_table).filter(
625
- sqlalchemy.and_(
626
- spot_table.c.spot_job_id == job_id,
627
- spot_table.c.task_id == task_id,
628
- spot_table.c.status == ManagedJobStatus.PENDING.value,
629
- spot_table.c.end_at.is_(None),
630
- )).update({spot_table.c.status: target_status})
631
- session.commit()
647
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
648
+ result = await session.execute(
649
+ sqlalchemy.update(spot_table).where(
650
+ sqlalchemy.and_(
651
+ spot_table.c.spot_job_id == job_id,
652
+ spot_table.c.task_id == task_id,
653
+ spot_table.c.end_at.is_(None),
654
+ )).values({spot_table.c.status: target_status}))
655
+ count = result.rowcount
656
+ await session.commit()
632
657
  logger.debug(f'back to {target_status}')
633
658
  if count != 1:
634
659
  raise exceptions.ManagedJobStatusError(
@@ -638,137 +663,6 @@ def set_restarting(job_id: int, task_id: int, recovering: bool):
638
663
  # initial (pre-`set_backoff_pending`) transition to STARTING or RECOVERING.
639
664
 
640
665
 
641
- @_init_db
642
- def set_started(job_id: int, task_id: int, start_time: float,
643
- callback_func: CallbackType):
644
- """Set the task to started state."""
645
- assert _SQLALCHEMY_ENGINE is not None
646
- logger.info('Job started.')
647
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
648
- count = session.query(spot_table).filter(
649
- sqlalchemy.and_(
650
- spot_table.c.spot_job_id == job_id,
651
- spot_table.c.task_id == task_id,
652
- spot_table.c.status.in_([
653
- ManagedJobStatus.STARTING.value,
654
- # If the task is empty, we will jump straight
655
- # from PENDING to RUNNING
656
- ManagedJobStatus.PENDING.value
657
- ]),
658
- spot_table.c.end_at.is_(None),
659
- )).update({
660
- spot_table.c.status: ManagedJobStatus.RUNNING.value,
661
- spot_table.c.start_at: start_time,
662
- spot_table.c.last_recovered_at: start_time,
663
- })
664
- session.commit()
665
- if count != 1:
666
- raise exceptions.ManagedJobStatusError(
667
- f'Failed to set the task to started. '
668
- f'({count} rows updated)')
669
- callback_func('STARTED')
670
-
671
-
672
- @_init_db
673
- def set_recovering(job_id: int, task_id: int, force_transit_to_recovering: bool,
674
- callback_func: CallbackType):
675
- """Set the task to recovering state, and update the job duration."""
676
- assert _SQLALCHEMY_ENGINE is not None
677
- logger.info('=== Recovering... ===')
678
- # NOTE: if we are resuming from a controller failure and the previous status
679
- # is STARTING, the initial value of `last_recovered_at` might not be set
680
- # yet (default value -1). In this case, we should not add current timestamp.
681
- # Otherwise, the job duration will be incorrect (~55 years from 1970).
682
- current_time = time.time()
683
-
684
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
685
- if force_transit_to_recovering:
686
- # For the HA job controller, it is possible that the jobs came from
687
- # any processing status to recovering. But it should not be any
688
- # terminal status as such jobs will not be recovered; and it should
689
- # not be CANCELLING as we will directly trigger a cleanup.
690
- status_condition = spot_table.c.status.in_(
691
- [s.value for s in ManagedJobStatus.processing_statuses()])
692
- else:
693
- status_condition = (
694
- spot_table.c.status == ManagedJobStatus.RUNNING.value)
695
-
696
- count = session.query(spot_table).filter(
697
- sqlalchemy.and_(
698
- spot_table.c.spot_job_id == job_id,
699
- spot_table.c.task_id == task_id,
700
- status_condition,
701
- spot_table.c.end_at.is_(None),
702
- )).update({
703
- spot_table.c.status: ManagedJobStatus.RECOVERING.value,
704
- spot_table.c.job_duration: sqlalchemy.case(
705
- (spot_table.c.last_recovered_at >= 0,
706
- spot_table.c.job_duration + current_time -
707
- spot_table.c.last_recovered_at),
708
- else_=spot_table.c.job_duration),
709
- spot_table.c.last_recovered_at: sqlalchemy.case(
710
- (spot_table.c.last_recovered_at < 0, current_time),
711
- else_=spot_table.c.last_recovered_at),
712
- })
713
- session.commit()
714
- if count != 1:
715
- raise exceptions.ManagedJobStatusError(
716
- f'Failed to set the task to recovering. '
717
- f'({count} rows updated)')
718
- callback_func('RECOVERING')
719
-
720
-
721
- @_init_db
722
- def set_recovered(job_id: int, task_id: int, recovered_time: float,
723
- callback_func: CallbackType):
724
- """Set the task to recovered."""
725
- assert _SQLALCHEMY_ENGINE is not None
726
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
727
- count = session.query(spot_table).filter(
728
- sqlalchemy.and_(
729
- spot_table.c.spot_job_id == job_id,
730
- spot_table.c.task_id == task_id,
731
- spot_table.c.status == ManagedJobStatus.RECOVERING.value,
732
- spot_table.c.end_at.is_(None),
733
- )).update({
734
- spot_table.c.status: ManagedJobStatus.RUNNING.value,
735
- spot_table.c.last_recovered_at: recovered_time,
736
- spot_table.c.recovery_count: spot_table.c.recovery_count + 1,
737
- })
738
- session.commit()
739
- if count != 1:
740
- raise exceptions.ManagedJobStatusError(
741
- f'Failed to set the task to recovered. '
742
- f'({count} rows updated)')
743
- logger.info('==== Recovered. ====')
744
- callback_func('RECOVERED')
745
-
746
-
747
- @_init_db
748
- def set_succeeded(job_id: int, task_id: int, end_time: float,
749
- callback_func: CallbackType):
750
- """Set the task to succeeded, if it is in a non-terminal state."""
751
- assert _SQLALCHEMY_ENGINE is not None
752
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
753
- count = session.query(spot_table).filter(
754
- sqlalchemy.and_(
755
- spot_table.c.spot_job_id == job_id,
756
- spot_table.c.task_id == task_id,
757
- spot_table.c.status == ManagedJobStatus.RUNNING.value,
758
- spot_table.c.end_at.is_(None),
759
- )).update({
760
- spot_table.c.status: ManagedJobStatus.SUCCEEDED.value,
761
- spot_table.c.end_at: end_time,
762
- })
763
- session.commit()
764
- if count != 1:
765
- raise exceptions.ManagedJobStatusError(
766
- f'Failed to set the task to succeeded. '
767
- f'({count} rows updated)')
768
- callback_func('SUCCEEDED')
769
- logger.info('Job succeeded.')
770
-
771
-
772
666
  @_init_db
773
667
  def set_failed(
774
668
  job_id: int,
@@ -834,51 +728,30 @@ def set_failed(
834
728
 
835
729
 
836
730
  @_init_db
837
- def set_cancelling(job_id: int, callback_func: CallbackType):
838
- """Set tasks in the job as cancelling, if they are in non-terminal states.
839
-
840
- task_id is not needed, because we expect the job should be cancelled
841
- as a whole, and we should not cancel a single task.
842
- """
731
+ def set_pending_cancelled(job_id: int):
732
+ """Set the job as pending cancelled, if it is in non-terminal states."""
843
733
  assert _SQLALCHEMY_ENGINE is not None
844
734
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
845
- count = session.query(spot_table).filter(
846
- sqlalchemy.and_(
735
+ # Subquery to get the spot_job_ids that match the joined condition
736
+ subquery = session.query(spot_table.c.job_id).join(
737
+ job_info_table,
738
+ spot_table.c.spot_job_id == job_info_table.c.spot_job_id).filter(
847
739
  spot_table.c.spot_job_id == job_id,
848
- spot_table.c.end_at.is_(None),
849
- )).update({spot_table.c.status: ManagedJobStatus.CANCELLING.value})
850
- session.commit()
851
- updated = count > 0
852
- if updated:
853
- logger.info('Cancelling the job...')
854
- callback_func('CANCELLING')
855
- else:
856
- logger.info('Cancellation skipped, job is already terminal')
857
-
858
-
859
- @_init_db
860
- def set_cancelled(job_id: int, callback_func: CallbackType):
861
- """Set tasks in the job as cancelled, if they are in CANCELLING state.
740
+ spot_table.c.status == ManagedJobStatus.PENDING.value,
741
+ sqlalchemy.or_(
742
+ job_info_table.c.schedule_state ==
743
+ ManagedJobScheduleState.WAITING.value,
744
+ job_info_table.c.schedule_state ==
745
+ ManagedJobScheduleState.INACTIVE.value,
746
+ ),
747
+ ).subquery()
862
748
 
863
- The set_cancelling should be called before this function.
864
- """
865
- assert _SQLALCHEMY_ENGINE is not None
866
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
867
749
  count = session.query(spot_table).filter(
868
- sqlalchemy.and_(
869
- spot_table.c.spot_job_id == job_id,
870
- spot_table.c.status == ManagedJobStatus.CANCELLING.value,
871
- )).update({
872
- spot_table.c.status: ManagedJobStatus.CANCELLED.value,
873
- spot_table.c.end_at: time.time(),
874
- })
750
+ spot_table.c.job_id.in_(subquery)).update(
751
+ {spot_table.c.status: ManagedJobStatus.CANCELLED.value},
752
+ synchronize_session=False)
875
753
  session.commit()
876
- updated = count > 0
877
- if updated:
878
- logger.info('Job cancelled.')
879
- callback_func('CANCELLED')
880
- else:
881
- logger.info('Cancellation skipped, job is not CANCELLING')
754
+ return count > 0
882
755
 
883
756
 
884
757
  @_init_db
@@ -936,45 +809,6 @@ def get_nonterminal_job_ids_by_name(name: Optional[str],
936
809
  return job_ids
937
810
 
938
811
 
939
- @_init_db
940
- def get_schedule_live_jobs(job_id: Optional[int]) -> List[Dict[str, Any]]:
941
- """Get jobs from the database that have a live schedule_state.
942
-
943
- This should return job(s) that are not INACTIVE, WAITING, or DONE. So a
944
- returned job should correspond to a live job controller process, with one
945
- exception: the job may have just transitioned from WAITING to LAUNCHING, but
946
- the controller process has not yet started.
947
- """
948
- assert _SQLALCHEMY_ENGINE is not None
949
-
950
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
951
- query = sqlalchemy.select(
952
- job_info_table.c.spot_job_id,
953
- job_info_table.c.schedule_state,
954
- job_info_table.c.controller_pid,
955
- ).where(~job_info_table.c.schedule_state.in_([
956
- ManagedJobScheduleState.INACTIVE.value,
957
- ManagedJobScheduleState.WAITING.value,
958
- ManagedJobScheduleState.DONE.value,
959
- ]))
960
-
961
- if job_id is not None:
962
- query = query.where(job_info_table.c.spot_job_id == job_id)
963
-
964
- query = query.order_by(job_info_table.c.spot_job_id.desc())
965
-
966
- rows = session.execute(query).fetchall()
967
- jobs = []
968
- for row in rows:
969
- job_dict = {
970
- 'job_id': row[0],
971
- 'schedule_state': ManagedJobScheduleState(row[1]),
972
- 'controller_pid': row[2],
973
- }
974
- jobs.append(job_dict)
975
- return jobs
976
-
977
-
978
812
  @_init_db
979
813
  def get_jobs_to_check_status(job_id: Optional[int] = None) -> List[int]:
980
814
  """Get jobs that need controller process checking.
@@ -1035,32 +869,6 @@ def get_jobs_to_check_status(job_id: Optional[int] = None) -> List[int]:
1035
869
  return [row[0] for row in rows if row[0] is not None]
1036
870
 
1037
871
 
1038
- @_init_db
1039
- def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
1040
- """Get all job ids by name."""
1041
- assert _SQLALCHEMY_ENGINE is not None
1042
-
1043
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
1044
- query = sqlalchemy.select(
1045
- spot_table.c.spot_job_id.distinct()).select_from(
1046
- spot_table.outerjoin(
1047
- job_info_table,
1048
- spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
1049
- if name is not None:
1050
- # We match the job name from `job_info` for the jobs submitted after
1051
- # #1982, and from `spot` for the jobs submitted before #1982, whose
1052
- # job_info is not available.
1053
- name_condition = sqlalchemy.or_(
1054
- job_info_table.c.name == name,
1055
- sqlalchemy.and_(job_info_table.c.name.is_(None),
1056
- spot_table.c.task_name == name))
1057
- query = query.where(name_condition)
1058
- query = query.order_by(spot_table.c.spot_job_id.desc())
1059
- rows = session.execute(query).fetchall()
1060
- job_ids = [row[0] for row in rows if row[0] is not None]
1061
- return job_ids
1062
-
1063
-
1064
872
  @_init_db
1065
873
  def _get_all_task_ids_statuses(
1066
874
  job_id: int) -> List[Tuple[int, ManagedJobStatus]]:
@@ -1092,18 +900,6 @@ def get_all_task_ids_names_statuses_logs(
1092
900
  for row in id_names]
1093
901
 
1094
902
 
1095
- @_init_db
1096
- def get_job_status_with_task_id(job_id: int,
1097
- task_id: int) -> Optional[ManagedJobStatus]:
1098
- assert _SQLALCHEMY_ENGINE is not None
1099
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
1100
- status = session.execute(
1101
- sqlalchemy.select(spot_table.c.status).where(
1102
- sqlalchemy.and_(spot_table.c.spot_job_id == job_id,
1103
- spot_table.c.task_id == task_id))).fetchone()
1104
- return ManagedJobStatus(status[0]) if status else None
1105
-
1106
-
1107
903
  def get_num_tasks(job_id: int) -> int:
1108
904
  return len(_get_all_task_ids_statuses(job_id))
1109
905
 
@@ -1131,6 +927,16 @@ def get_latest_task_id_status(
1131
927
  return task_id, status
1132
928
 
1133
929
 
930
+ @_init_db
931
+ def get_job_controller_pid(job_id: int) -> Optional[int]:
932
+ assert _SQLALCHEMY_ENGINE is not None
933
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
934
+ pid = session.execute(
935
+ sqlalchemy.select(job_info_table.c.controller_pid).where(
936
+ job_info_table.c.spot_job_id == job_id)).fetchone()
937
+ return pid[0] if pid else None
938
+
939
+
1134
940
  def get_status(job_id: int) -> Optional[ManagedJobStatus]:
1135
941
  _, status = get_latest_task_id_status(job_id)
1136
942
  return status
@@ -1243,30 +1049,10 @@ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
1243
1049
 
1244
1050
 
1245
1051
  @_init_db
1246
- def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
1247
- """Get the local log directory for a job."""
1248
- assert _SQLALCHEMY_ENGINE is not None
1249
-
1250
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
1251
- where_conditions = [spot_table.c.spot_job_id == job_id]
1252
- if task_id is not None:
1253
- where_conditions.append(spot_table.c.task_id == task_id)
1254
- local_log_file = session.execute(
1255
- sqlalchemy.select(spot_table.c.local_log_file).where(
1256
- sqlalchemy.and_(*where_conditions))).fetchone()
1257
- return local_log_file[-1] if local_log_file else None
1258
-
1259
-
1260
- # === Scheduler state functions ===
1261
- # Only the scheduler should call these functions. They may require holding the
1262
- # scheduler lock to work correctly.
1263
-
1264
-
1265
- @_init_db
1266
- def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
1267
- original_user_yaml_path: str, env_file_path: str,
1268
- user_hash: str, priority: int) -> bool:
1269
- """Do not call without holding the scheduler lock.
1052
+ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
1053
+ original_user_yaml_path: str, env_file_path: str,
1054
+ user_hash: str, priority: int):
1055
+ """Do not call without holding the scheduler lock.
1270
1056
 
1271
1057
  Returns: Whether this is a recovery run or not.
1272
1058
  If this is a recovery run, the job may already be in the WAITING
@@ -1277,11 +1063,7 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
1277
1063
  assert _SQLALCHEMY_ENGINE is not None
1278
1064
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1279
1065
  updated_count = session.query(job_info_table).filter(
1280
- sqlalchemy.and_(
1281
- job_info_table.c.spot_job_id == job_id,
1282
- job_info_table.c.schedule_state ==
1283
- ManagedJobScheduleState.INACTIVE.value,
1284
- )
1066
+ sqlalchemy.and_(job_info_table.c.spot_job_id == job_id,)
1285
1067
  ).update({
1286
1068
  job_info_table.c.schedule_state:
1287
1069
  ManagedJobScheduleState.WAITING.value,
@@ -1292,9 +1074,7 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
1292
1074
  job_info_table.c.priority: priority,
1293
1075
  })
1294
1076
  session.commit()
1295
- # For a recovery run, the job may already be in the WAITING state.
1296
1077
  assert updated_count <= 1, (job_id, updated_count)
1297
- return updated_count == 0
1298
1078
 
1299
1079
 
1300
1080
  @_init_db
@@ -1319,17 +1099,15 @@ def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
1319
1099
  session.commit()
1320
1100
 
1321
1101
 
1322
- @_init_db
1323
- def set_job_id_on_pool_cluster(job_id: int,
1324
- job_id_on_pool_cluster: int) -> None:
1102
+ @_init_db_async
1103
+ async def set_job_id_on_pool_cluster_async(job_id: int,
1104
+ job_id_on_pool_cluster: int) -> None:
1325
1105
  """Set the job id on the pool cluster for a job."""
1326
- assert _SQLALCHEMY_ENGINE is not None
1327
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
1328
- session.query(job_info_table).filter(
1329
- job_info_table.c.spot_job_id == job_id).update({
1330
- job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster
1331
- })
1332
- session.commit()
1106
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1107
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1108
+ await session.execute(job_info_table.c.spot_job_id == job_id).update(
1109
+ {job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster})
1110
+ await session.commit()
1333
1111
 
1334
1112
 
1335
1113
  @_init_db
@@ -1347,77 +1125,54 @@ def get_pool_submit_info(job_id: int) -> Tuple[Optional[str], Optional[int]]:
1347
1125
  return info[0], info[1]
1348
1126
 
1349
1127
 
1350
- @_init_db
1351
- def scheduler_set_launching(job_id: int,
1352
- current_state: ManagedJobScheduleState) -> None:
1353
- """Do not call without holding the scheduler lock."""
1354
- assert _SQLALCHEMY_ENGINE is not None
1355
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
1356
- updated_count = session.query(job_info_table).filter(
1357
- sqlalchemy.and_(
1358
- job_info_table.c.spot_job_id == job_id,
1359
- job_info_table.c.schedule_state == current_state.value,
1360
- )).update({
1361
- job_info_table.c.schedule_state:
1362
- ManagedJobScheduleState.LAUNCHING.value
1363
- })
1364
- session.commit()
1365
- assert updated_count == 1, (job_id, updated_count)
1366
-
1367
-
1368
- @_init_db
1369
- def scheduler_set_alive(job_id: int) -> None:
1370
- """Do not call without holding the scheduler lock."""
1371
- assert _SQLALCHEMY_ENGINE is not None
1372
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
1373
- updated_count = session.query(job_info_table).filter(
1374
- sqlalchemy.and_(
1375
- job_info_table.c.spot_job_id == job_id,
1376
- job_info_table.c.schedule_state ==
1377
- ManagedJobScheduleState.LAUNCHING.value,
1378
- )).update({
1379
- job_info_table.c.schedule_state:
1380
- ManagedJobScheduleState.ALIVE.value
1381
- })
1382
- session.commit()
1383
- assert updated_count == 1, (job_id, updated_count)
1128
+ @_init_db_async
1129
+ async def get_pool_submit_info_async(
1130
+ job_id: int) -> Tuple[Optional[str], Optional[int]]:
1131
+ """Get the cluster name and job id on the pool from the managed job id."""
1132
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1133
+ async with orm.Session(_SQLALCHEMY_ENGINE_ASYNC) as session:
1134
+ info = await session.execute(
1135
+ sqlalchemy.select(job_info_table.c.current_cluster_name,
1136
+ job_info_table.c.job_id_on_pool_cluster).where(
1137
+ job_info_table.c.spot_job_id == job_id)
1138
+ ).fetchone()
1139
+ if info is None:
1140
+ return None, None
1141
+ return info[0], info[1]
1384
1142
 
1385
1143
 
1386
- @_init_db
1387
- def scheduler_set_alive_backoff(job_id: int) -> None:
1388
- """Do not call without holding the scheduler lock."""
1389
- assert _SQLALCHEMY_ENGINE is not None
1390
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
1391
- updated_count = session.query(job_info_table).filter(
1392
- sqlalchemy.and_(
1393
- job_info_table.c.spot_job_id == job_id,
1394
- job_info_table.c.schedule_state ==
1395
- ManagedJobScheduleState.LAUNCHING.value,
1396
- )).update({
1397
- job_info_table.c.schedule_state:
1398
- ManagedJobScheduleState.ALIVE_BACKOFF.value
1399
- })
1400
- session.commit()
1401
- assert updated_count == 1, (job_id, updated_count)
1144
+ @_init_db_async
1145
+ async def scheduler_set_launching_async(job_id: int):
1146
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1147
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1148
+ await session.execute(
1149
+ sqlalchemy.update(job_info_table).where(
1150
+ sqlalchemy.and_(job_info_table.c.spot_job_id == job_id)).values(
1151
+ {
1152
+ job_info_table.c.schedule_state:
1153
+ ManagedJobScheduleState.LAUNCHING.value
1154
+ }))
1155
+ await session.commit()
1402
1156
 
1403
1157
 
1404
- @_init_db
1405
- def scheduler_set_alive_waiting(job_id: int) -> None:
1158
+ @_init_db_async
1159
+ async def scheduler_set_alive_async(job_id: int) -> None:
1406
1160
  """Do not call without holding the scheduler lock."""
1407
- assert _SQLALCHEMY_ENGINE is not None
1408
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
1409
- updated_count = session.query(job_info_table).filter(
1410
- sqlalchemy.and_(
1411
- job_info_table.c.spot_job_id == job_id,
1412
- job_info_table.c.schedule_state.in_([
1413
- ManagedJobScheduleState.ALIVE.value,
1414
- ManagedJobScheduleState.ALIVE_BACKOFF.value,
1415
- ]))).update({
1161
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1162
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1163
+ result = await session.execute(
1164
+ sqlalchemy.update(job_info_table).where(
1165
+ sqlalchemy.and_(
1166
+ job_info_table.c.spot_job_id == job_id,
1167
+ job_info_table.c.schedule_state ==
1168
+ ManagedJobScheduleState.LAUNCHING.value,
1169
+ )).values({
1416
1170
  job_info_table.c.schedule_state:
1417
- ManagedJobScheduleState.ALIVE_WAITING.value
1418
- })
1419
- session.commit()
1420
- assert updated_count == 1, (job_id, updated_count)
1171
+ ManagedJobScheduleState.ALIVE.value
1172
+ }))
1173
+ changes = result.rowcount
1174
+ await session.commit()
1175
+ assert changes == 1, (job_id, changes)
1421
1176
 
1422
1177
 
1423
1178
  @_init_db
@@ -1439,16 +1194,6 @@ def scheduler_set_done(job_id: int, idempotent: bool = False) -> None:
1439
1194
  assert updated_count == 1, (job_id, updated_count)
1440
1195
 
1441
1196
 
1442
- @_init_db
1443
- def set_job_controller_pid(job_id: int, pid: int):
1444
- assert _SQLALCHEMY_ENGINE is not None
1445
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
1446
- updated_count = session.query(job_info_table).filter_by(
1447
- spot_job_id=job_id).update({job_info_table.c.controller_pid: pid})
1448
- session.commit()
1449
- assert updated_count == 1, (job_id, updated_count)
1450
-
1451
-
1452
1197
  @_init_db
1453
1198
  def get_job_schedule_state(job_id: int) -> ManagedJobScheduleState:
1454
1199
  assert _SQLALCHEMY_ENGINE is not None
@@ -1527,58 +1272,78 @@ def get_nonterminal_job_ids_by_pool(pool: str,
1527
1272
  return job_ids
1528
1273
 
1529
1274
 
1530
- @_init_db
1531
- def get_waiting_job() -> Optional[Dict[str, Any]]:
1275
+ @_init_db_async
1276
+ async def get_waiting_job_async(pid: int) -> Optional[Dict[str, Any]]:
1532
1277
  """Get the next job that should transition to LAUNCHING.
1533
1278
 
1534
- Selects the highest-priority WAITING or ALIVE_WAITING job, provided its
1535
- priority is greater than or equal to any currently LAUNCHING or
1536
- ALIVE_BACKOFF job.
1279
+ Selects the highest-priority WAITING or ALIVE_WAITING job and atomically
1280
+ transitions it to LAUNCHING state to prevent race conditions.
1281
+
1282
+ Returns the job information if a job was successfully transitioned to
1283
+ LAUNCHING, or None if no suitable job was found.
1537
1284
 
1538
1285
  Backwards compatibility note: jobs submitted before #4485 will have no
1539
1286
  schedule_state and will be ignored by this SQL query.
1540
1287
  """
1541
- assert _SQLALCHEMY_ENGINE is not None
1542
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
1543
- # Get the highest-priority WAITING or ALIVE_WAITING job whose priority
1544
- # is greater than or equal to the highest priority LAUNCHING or
1545
- # ALIVE_BACKOFF job's priority.
1546
- # First, get the max priority of LAUNCHING or ALIVE_BACKOFF jobs
1547
- max_priority_subquery = sqlalchemy.select(
1548
- sqlalchemy.func.max(job_info_table.c.priority)).where(
1549
- job_info_table.c.schedule_state.in_([
1550
- ManagedJobScheduleState.LAUNCHING.value,
1551
- ManagedJobScheduleState.ALIVE_BACKOFF.value,
1552
- ])).scalar_subquery()
1553
- # Main query for waiting jobs
1554
- select_conds = [
1555
- job_info_table.c.schedule_state.in_([
1556
- ManagedJobScheduleState.WAITING.value,
1557
- ManagedJobScheduleState.ALIVE_WAITING.value,
1558
- ]),
1559
- job_info_table.c.priority >= sqlalchemy.func.coalesce(
1560
- max_priority_subquery, 0),
1561
- ]
1562
- query = sqlalchemy.select(
1288
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1289
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1290
+ # Select the highest priority waiting job for update (locks the row)
1291
+ select_query = sqlalchemy.select(
1563
1292
  job_info_table.c.spot_job_id,
1564
1293
  job_info_table.c.schedule_state,
1565
1294
  job_info_table.c.dag_yaml_path,
1566
1295
  job_info_table.c.env_file_path,
1296
+ job_info_table.c.controller_pid,
1567
1297
  job_info_table.c.pool,
1568
- ).where(sqlalchemy.and_(*select_conds)).order_by(
1569
- job_info_table.c.priority.desc(),
1570
- job_info_table.c.spot_job_id.asc(),
1571
- ).limit(1)
1572
- waiting_job_row = session.execute(query).fetchone()
1298
+ ).where(
1299
+ job_info_table.c.schedule_state.in_([
1300
+ ManagedJobScheduleState.WAITING.value,
1301
+ ])).order_by(
1302
+ job_info_table.c.priority.desc(),
1303
+ job_info_table.c.spot_job_id.asc(),
1304
+ ).limit(1).with_for_update()
1305
+
1306
+ # Execute the select with row locking
1307
+ result = await session.execute(select_query)
1308
+ waiting_job_row = result.fetchone()
1309
+
1573
1310
  if waiting_job_row is None:
1574
1311
  return None
1575
1312
 
1313
+ job_id = waiting_job_row[0]
1314
+ current_state = ManagedJobScheduleState(waiting_job_row[1])
1315
+ dag_yaml_path = waiting_job_row[2]
1316
+ env_file_path = waiting_job_row[3]
1317
+ controller_pid = waiting_job_row[4]
1318
+ pool = waiting_job_row[5]
1319
+
1320
+ # Update the job state to LAUNCHING
1321
+ update_result = await session.execute(
1322
+ sqlalchemy.update(job_info_table).where(
1323
+ sqlalchemy.and_(
1324
+ job_info_table.c.spot_job_id == job_id,
1325
+ job_info_table.c.schedule_state == current_state.value,
1326
+ )).values({
1327
+ job_info_table.c.schedule_state:
1328
+ ManagedJobScheduleState.LAUNCHING.value,
1329
+ job_info_table.c.controller_pid: pid,
1330
+ }))
1331
+
1332
+ if update_result.rowcount != 1:
1333
+ # Update failed, rollback and return None
1334
+ await session.rollback()
1335
+ return None
1336
+
1337
+ # Commit the transaction
1338
+ await session.commit()
1339
+
1576
1340
  return {
1577
- 'job_id': waiting_job_row[0],
1578
- 'schedule_state': ManagedJobScheduleState(waiting_job_row[1]),
1579
- 'dag_yaml_path': waiting_job_row[2],
1580
- 'env_file_path': waiting_job_row[3],
1581
- 'pool': waiting_job_row[4],
1341
+ 'job_id': job_id,
1342
+ 'schedule_state': current_state,
1343
+ 'dag_yaml_path': dag_yaml_path,
1344
+ 'env_file_path': env_file_path,
1345
+ 'old_pid': controller_pid,
1346
+ 'pool': pool,
1582
1347
  }
1583
1348
 
1584
1349
 
@@ -1641,3 +1406,429 @@ def remove_ha_recovery_script(job_id: int) -> None:
1641
1406
  session.query(ha_recovery_script_table).filter_by(
1642
1407
  job_id=job_id).delete()
1643
1408
  session.commit()
1409
+
1410
+
1411
+ @_init_db_async
1412
+ async def get_latest_task_id_status_async(
1413
+ job_id: int) -> Union[Tuple[int, ManagedJobStatus], Tuple[None, None]]:
1414
+ """Returns the (task id, status) of the latest task of a job."""
1415
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1416
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1417
+ result = await session.execute(
1418
+ sqlalchemy.select(
1419
+ spot_table.c.task_id,
1420
+ spot_table.c.status,
1421
+ ).where(spot_table.c.spot_job_id == job_id).order_by(
1422
+ spot_table.c.task_id.asc()))
1423
+ id_statuses = [
1424
+ (row[0], ManagedJobStatus(row[1])) for row in result.fetchall()
1425
+ ]
1426
+
1427
+ if not id_statuses:
1428
+ return None, None
1429
+ task_id, status = next(
1430
+ ((tid, st) for tid, st in id_statuses if not st.is_terminal()),
1431
+ id_statuses[-1],
1432
+ )
1433
+ return task_id, status
1434
+
1435
+
1436
+ @_init_db_async
1437
+ async def set_starting_async(job_id: int, task_id: int, run_timestamp: str,
1438
+ submit_time: float, resources_str: str,
1439
+ specs: Dict[str, Union[str, int]],
1440
+ callback_func: AsyncCallbackType):
1441
+ """Set the task to starting state."""
1442
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1443
+ logger.info('Launching the spot cluster...')
1444
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1445
+ result = await session.execute(
1446
+ sqlalchemy.update(spot_table).where(
1447
+ sqlalchemy.and_(
1448
+ spot_table.c.spot_job_id == job_id,
1449
+ spot_table.c.task_id == task_id,
1450
+ spot_table.c.status == ManagedJobStatus.PENDING.value,
1451
+ spot_table.c.end_at.is_(None),
1452
+ )).values({
1453
+ spot_table.c.resources: resources_str,
1454
+ spot_table.c.submitted_at: submit_time,
1455
+ spot_table.c.status: ManagedJobStatus.STARTING.value,
1456
+ spot_table.c.run_timestamp: run_timestamp,
1457
+ spot_table.c.specs: json.dumps(specs),
1458
+ }))
1459
+ count = result.rowcount
1460
+ await session.commit()
1461
+ if count != 1:
1462
+ raise exceptions.ManagedJobStatusError(
1463
+ 'Failed to set the task to starting. '
1464
+ f'({count} rows updated)')
1465
+ await callback_func('SUBMITTED')
1466
+ await callback_func('STARTING')
1467
+
1468
+
1469
+ @_init_db_async
1470
+ async def set_started_async(job_id: int, task_id: int, start_time: float,
1471
+ callback_func: AsyncCallbackType):
1472
+ """Set the task to started state."""
1473
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1474
+ logger.info('Job started.')
1475
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1476
+ result = await session.execute(
1477
+ sqlalchemy.update(spot_table).where(
1478
+ sqlalchemy.and_(
1479
+ spot_table.c.spot_job_id == job_id,
1480
+ spot_table.c.task_id == task_id,
1481
+ spot_table.c.status.in_([
1482
+ ManagedJobStatus.STARTING.value,
1483
+ ManagedJobStatus.PENDING.value
1484
+ ]),
1485
+ spot_table.c.end_at.is_(None),
1486
+ )).values({
1487
+ spot_table.c.status: ManagedJobStatus.RUNNING.value,
1488
+ spot_table.c.start_at: start_time,
1489
+ spot_table.c.last_recovered_at: start_time,
1490
+ }))
1491
+ count = result.rowcount
1492
+ await session.commit()
1493
+ if count != 1:
1494
+ raise exceptions.ManagedJobStatusError(
1495
+ f'Failed to set the task to started. '
1496
+ f'({count} rows updated)')
1497
+ await callback_func('STARTED')
1498
+
1499
+
1500
+ @_init_db_async
1501
+ async def get_job_status_with_task_id_async(
1502
+ job_id: int, task_id: int) -> Optional[ManagedJobStatus]:
1503
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1504
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1505
+ result = await session.execute(
1506
+ sqlalchemy.select(spot_table.c.status).where(
1507
+ sqlalchemy.and_(spot_table.c.spot_job_id == job_id,
1508
+ spot_table.c.task_id == task_id)))
1509
+ status = result.fetchone()
1510
+ return ManagedJobStatus(status[0]) if status else None
1511
+
1512
+
1513
+ @_init_db_async
1514
+ async def set_recovering_async(job_id: int, task_id: int,
1515
+ force_transit_to_recovering: bool,
1516
+ callback_func: AsyncCallbackType):
1517
+ """Set the task to recovering state, and update the job duration."""
1518
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1519
+ logger.info('=== Recovering... ===')
1520
+ current_time = time.time()
1521
+
1522
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1523
+ if force_transit_to_recovering:
1524
+ status_condition = spot_table.c.status.in_(
1525
+ [s.value for s in ManagedJobStatus.processing_statuses()])
1526
+ else:
1527
+ status_condition = (
1528
+ spot_table.c.status == ManagedJobStatus.RUNNING.value)
1529
+
1530
+ result = await session.execute(
1531
+ sqlalchemy.update(spot_table).where(
1532
+ sqlalchemy.and_(
1533
+ spot_table.c.spot_job_id == job_id,
1534
+ spot_table.c.task_id == task_id,
1535
+ status_condition,
1536
+ spot_table.c.end_at.is_(None),
1537
+ )).values({
1538
+ spot_table.c.status: ManagedJobStatus.RECOVERING.value,
1539
+ spot_table.c.job_duration: sqlalchemy.case(
1540
+ (spot_table.c.last_recovered_at >= 0,
1541
+ spot_table.c.job_duration + current_time -
1542
+ spot_table.c.last_recovered_at),
1543
+ else_=spot_table.c.job_duration),
1544
+ spot_table.c.last_recovered_at: sqlalchemy.case(
1545
+ (spot_table.c.last_recovered_at < 0, current_time),
1546
+ else_=spot_table.c.last_recovered_at),
1547
+ }))
1548
+ count = result.rowcount
1549
+ await session.commit()
1550
+ if count != 1:
1551
+ raise exceptions.ManagedJobStatusError(
1552
+ f'Failed to set the task to recovering. '
1553
+ f'({count} rows updated)')
1554
+ await callback_func('RECOVERING')
1555
+
1556
+
1557
+ @_init_db_async
1558
+ async def set_recovered_async(job_id: int, task_id: int, recovered_time: float,
1559
+ callback_func: AsyncCallbackType):
1560
+ """Set the task to recovered."""
1561
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1562
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1563
+ result = await session.execute(
1564
+ sqlalchemy.update(spot_table).where(
1565
+ sqlalchemy.and_(
1566
+ spot_table.c.spot_job_id == job_id,
1567
+ spot_table.c.task_id == task_id,
1568
+ spot_table.c.status == ManagedJobStatus.RECOVERING.value,
1569
+ spot_table.c.end_at.is_(None),
1570
+ )).values({
1571
+ spot_table.c.status: ManagedJobStatus.RUNNING.value,
1572
+ spot_table.c.last_recovered_at: recovered_time,
1573
+ spot_table.c.recovery_count: spot_table.c.recovery_count +
1574
+ 1,
1575
+ }))
1576
+ count = result.rowcount
1577
+ await session.commit()
1578
+ if count != 1:
1579
+ raise exceptions.ManagedJobStatusError(
1580
+ f'Failed to set the task to recovered. '
1581
+ f'({count} rows updated)')
1582
+ logger.info('==== Recovered. ====')
1583
+ await callback_func('RECOVERED')
1584
+
1585
+
1586
+ @_init_db_async
1587
+ async def set_succeeded_async(job_id: int, task_id: int, end_time: float,
1588
+ callback_func: AsyncCallbackType):
1589
+ """Set the task to succeeded, if it is in a non-terminal state."""
1590
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1591
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1592
+ result = await session.execute(
1593
+ sqlalchemy.update(spot_table).where(
1594
+ sqlalchemy.and_(
1595
+ spot_table.c.spot_job_id == job_id,
1596
+ spot_table.c.task_id == task_id,
1597
+ spot_table.c.status == ManagedJobStatus.RUNNING.value,
1598
+ spot_table.c.end_at.is_(None),
1599
+ )).values({
1600
+ spot_table.c.status: ManagedJobStatus.SUCCEEDED.value,
1601
+ spot_table.c.end_at: end_time,
1602
+ }))
1603
+ count = result.rowcount
1604
+ await session.commit()
1605
+ if count != 1:
1606
+ raise exceptions.ManagedJobStatusError(
1607
+ f'Failed to set the task to succeeded. '
1608
+ f'({count} rows updated)')
1609
+ await callback_func('SUCCEEDED')
1610
+ logger.info('Job succeeded.')
1611
+
1612
+
1613
+ @_init_db_async
1614
+ async def set_failed_async(
1615
+ job_id: int,
1616
+ task_id: Optional[int],
1617
+ failure_type: ManagedJobStatus,
1618
+ failure_reason: str,
1619
+ callback_func: Optional[AsyncCallbackType] = None,
1620
+ end_time: Optional[float] = None,
1621
+ override_terminal: bool = False,
1622
+ ):
1623
+ """Set an entire job or task to failed."""
1624
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1625
+ assert failure_type.is_failed(), failure_type
1626
+ end_time = time.time() if end_time is None else end_time
1627
+
1628
+ fields_to_set: Dict[str, Any] = {
1629
+ spot_table.c.status: failure_type.value,
1630
+ spot_table.c.failure_reason: failure_reason,
1631
+ }
1632
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1633
+ # Get previous status
1634
+ result = await session.execute(
1635
+ sqlalchemy.select(
1636
+ spot_table.c.status).where(spot_table.c.spot_job_id == job_id))
1637
+ previous_status_row = result.fetchone()
1638
+ previous_status = ManagedJobStatus(previous_status_row[0])
1639
+ if previous_status == ManagedJobStatus.RECOVERING:
1640
+ fields_to_set[spot_table.c.last_recovered_at] = end_time
1641
+ where_conditions = [spot_table.c.spot_job_id == job_id]
1642
+ if task_id is not None:
1643
+ where_conditions.append(spot_table.c.task_id == task_id)
1644
+ if override_terminal:
1645
+ fields_to_set[spot_table.c.end_at] = sqlalchemy.func.coalesce(
1646
+ spot_table.c.end_at, end_time)
1647
+ else:
1648
+ fields_to_set[spot_table.c.end_at] = end_time
1649
+ where_conditions.append(spot_table.c.end_at.is_(None))
1650
+ result = await session.execute(
1651
+ sqlalchemy.update(spot_table).where(
1652
+ sqlalchemy.and_(*where_conditions)).values(fields_to_set))
1653
+ count = result.rowcount
1654
+ await session.commit()
1655
+ updated = count > 0
1656
+ if callback_func and updated:
1657
+ await callback_func('FAILED')
1658
+ logger.info(failure_reason)
1659
+
1660
+
1661
+ @_init_db_async
1662
+ async def set_cancelling_async(job_id: int, callback_func: AsyncCallbackType):
1663
+ """Set tasks in the job as cancelling, if they are in non-terminal
1664
+ states."""
1665
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1666
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1667
+ result = await session.execute(
1668
+ sqlalchemy.update(spot_table).where(
1669
+ sqlalchemy.and_(
1670
+ spot_table.c.spot_job_id == job_id,
1671
+ spot_table.c.end_at.is_(None),
1672
+ )).values(
1673
+ {spot_table.c.status: ManagedJobStatus.CANCELLING.value}))
1674
+ count = result.rowcount
1675
+ await session.commit()
1676
+ updated = count > 0
1677
+ if updated:
1678
+ logger.info('Cancelling the job...')
1679
+ await callback_func('CANCELLING')
1680
+ else:
1681
+ logger.info('Cancellation skipped, job is already terminal')
1682
+
1683
+
1684
+ @_init_db_async
1685
+ async def set_cancelled_async(job_id: int, callback_func: AsyncCallbackType):
1686
+ """Set tasks in the job as cancelled, if they are in CANCELLING state."""
1687
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1688
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1689
+ result = await session.execute(
1690
+ sqlalchemy.update(spot_table).where(
1691
+ sqlalchemy.and_(
1692
+ spot_table.c.spot_job_id == job_id,
1693
+ spot_table.c.status == ManagedJobStatus.CANCELLING.value,
1694
+ )).values({
1695
+ spot_table.c.status: ManagedJobStatus.CANCELLED.value,
1696
+ spot_table.c.end_at: time.time(),
1697
+ }))
1698
+ count = result.rowcount
1699
+ await session.commit()
1700
+ updated = count > 0
1701
+ if updated:
1702
+ logger.info('Job cancelled.')
1703
+ await callback_func('CANCELLED')
1704
+ else:
1705
+ logger.info('Cancellation skipped, job is not CANCELLING')
1706
+
1707
+
1708
+ @_init_db_async
1709
+ async def remove_ha_recovery_script_async(job_id: int) -> None:
1710
+ """Remove the HA recovery script for a job."""
1711
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1712
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1713
+ await session.execute(
1714
+ sqlalchemy.delete(ha_recovery_script_table).where(
1715
+ ha_recovery_script_table.c.job_id == job_id))
1716
+ await session.commit()
1717
+
1718
+
1719
+ async def get_status_async(job_id: int) -> Optional[ManagedJobStatus]:
1720
+ _, status = await get_latest_task_id_status_async(job_id)
1721
+ return status
1722
+
1723
+
1724
+ @_init_db_async
1725
+ async def get_job_schedule_state_async(job_id: int) -> ManagedJobScheduleState:
1726
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1727
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1728
+ result = await session.execute(
1729
+ sqlalchemy.select(job_info_table.c.schedule_state).where(
1730
+ job_info_table.c.spot_job_id == job_id))
1731
+ state = result.fetchone()[0]
1732
+ return ManagedJobScheduleState(state)
1733
+
1734
+
1735
+ @_init_db_async
1736
+ async def scheduler_set_done_async(job_id: int,
1737
+ idempotent: bool = False) -> None:
1738
+ """Do not call without holding the scheduler lock."""
1739
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
1740
+ async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
1741
+ result = await session.execute(
1742
+ sqlalchemy.update(job_info_table).where(
1743
+ sqlalchemy.and_(
1744
+ job_info_table.c.spot_job_id == job_id,
1745
+ job_info_table.c.schedule_state !=
1746
+ ManagedJobScheduleState.DONE.value,
1747
+ )).values({
1748
+ job_info_table.c.schedule_state:
1749
+ ManagedJobScheduleState.DONE.value
1750
+ }))
1751
+ updated_count = result.rowcount
1752
+ await session.commit()
1753
+ if not idempotent:
1754
+ assert updated_count == 1, (job_id, updated_count)
1755
+
1756
+
1757
+ # ==== needed for codegen ====
1758
+ # functions have no use outside of codegen, remove at your own peril
1759
+
1760
+
1761
+ @_init_db
1762
+ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
1763
+ pool: Optional[str], pool_hash: Optional[str]):
1764
+ assert _SQLALCHEMY_ENGINE is not None
1765
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1766
+ if (_SQLALCHEMY_ENGINE.dialect.name ==
1767
+ db_utils.SQLAlchemyDialect.SQLITE.value):
1768
+ insert_func = sqlite.insert
1769
+ elif (_SQLALCHEMY_ENGINE.dialect.name ==
1770
+ db_utils.SQLAlchemyDialect.POSTGRESQL.value):
1771
+ insert_func = postgresql.insert
1772
+ else:
1773
+ raise ValueError('Unsupported database dialect')
1774
+ insert_stmt = insert_func(job_info_table).values(
1775
+ spot_job_id=job_id,
1776
+ name=name,
1777
+ schedule_state=ManagedJobScheduleState.INACTIVE.value,
1778
+ workspace=workspace,
1779
+ entrypoint=entrypoint,
1780
+ pool=pool,
1781
+ pool_hash=pool_hash,
1782
+ )
1783
+ session.execute(insert_stmt)
1784
+ session.commit()
1785
+
1786
+
1787
+ @_init_db
1788
+ def reset_jobs_for_recovery() -> None:
1789
+ """Remove controller PIDs for live jobs, allowing them to be recovered."""
1790
+ assert _SQLALCHEMY_ENGINE is not None
1791
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1792
+ session.query(job_info_table).filter(
1793
+ # PID should be set.
1794
+ job_info_table.c.controller_pid.isnot(None),
1795
+ # Schedule state should be alive.
1796
+ job_info_table.c.schedule_state.isnot(None),
1797
+ (job_info_table.c.schedule_state !=
1798
+ ManagedJobScheduleState.INVALID.value),
1799
+ (job_info_table.c.schedule_state !=
1800
+ ManagedJobScheduleState.WAITING.value),
1801
+ (job_info_table.c.schedule_state !=
1802
+ ManagedJobScheduleState.DONE.value),
1803
+ ).update({
1804
+ job_info_table.c.controller_pid: None,
1805
+ job_info_table.c.schedule_state:
1806
+ (ManagedJobScheduleState.WAITING.value)
1807
+ })
1808
+ session.commit()
1809
+
1810
+
1811
+ @_init_db
1812
+ def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
1813
+ """Get all job ids by name."""
1814
+ assert _SQLALCHEMY_ENGINE is not None
1815
+
1816
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1817
+ query = sqlalchemy.select(
1818
+ spot_table.c.spot_job_id.distinct()).select_from(
1819
+ spot_table.outerjoin(
1820
+ job_info_table,
1821
+ spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
1822
+ if name is not None:
1823
+ # We match the job name from `job_info` for the jobs submitted after
1824
+ # #1982, and from `spot` for the jobs submitted before #1982, whose
1825
+ # job_info is not available.
1826
+ name_condition = sqlalchemy.or_(
1827
+ job_info_table.c.name == name,
1828
+ sqlalchemy.and_(job_info_table.c.name.is_(None),
1829
+ spot_table.c.task_name == name))
1830
+ query = query.where(name_condition)
1831
+ query = query.order_by(spot_table.c.spot_job_id.desc())
1832
+ rows = session.execute(query).fetchall()
1833
+ job_ids = [row[0] for row in rows if row[0] is not None]
1834
+ return job_ids