skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250910__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly has been flagged as potentially problematic.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -4
- sky/backends/backend_utils.py +35 -1
- sky/backends/cloud_vm_ray_backend.py +2 -2
- sky/client/sdk.py +20 -0
- sky/client/sdk_async.py +18 -16
- sky/clouds/aws.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-1d7e11230da3ca89.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +5 -1
- sky/execution.py +21 -14
- sky/jobs/constants.py +3 -0
- sky/jobs/controller.py +732 -310
- sky/jobs/recovery_strategy.py +251 -129
- sky/jobs/scheduler.py +247 -174
- sky/jobs/server/core.py +20 -4
- sky/jobs/server/utils.py +2 -2
- sky/jobs/state.py +702 -511
- sky/jobs/utils.py +94 -39
- sky/provision/aws/config.py +4 -1
- sky/provision/gcp/config.py +6 -1
- sky/provision/kubernetes/utils.py +17 -8
- sky/provision/provisioner.py +1 -0
- sky/serve/replica_managers.py +0 -7
- sky/serve/serve_utils.py +5 -0
- sky/serve/server/impl.py +1 -2
- sky/serve/service.py +0 -2
- sky/server/common.py +8 -3
- sky/server/config.py +43 -24
- sky/server/constants.py +1 -0
- sky/server/daemons.py +7 -11
- sky/server/requests/serializers/encoders.py +1 -1
- sky/server/server.py +8 -1
- sky/setup_files/dependencies.py +4 -2
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/constants.py +3 -1
- sky/skylet/events.py +2 -10
- sky/utils/command_runner.pyi +3 -3
- sky/utils/common_utils.py +11 -1
- sky/utils/controller_utils.py +5 -0
- sky/utils/db/db_utils.py +31 -2
- sky/utils/rich_utils.py +3 -1
- sky/utils/subprocess_utils.py +9 -0
- sky/volumes/volume.py +2 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/METADATA +39 -37
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/RECORD +67 -67
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
@@ -1,13 +1,17 @@
 """The database for managed jobs status."""
 # TODO(zhwu): maybe use file based status instead of database, so
 # that we can easily switch to a s3-based storage.
+import asyncio
 import enum
 import functools
+import ipaddress
 import json
+import sqlite3
 import threading
 import time
 import typing
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, Union
+import urllib.parse

 import colorama
 import sqlalchemy
@@ -15,27 +19,34 @@ from sqlalchemy import exc as sqlalchemy_exc
 from sqlalchemy import orm
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.dialects import sqlite
+from sqlalchemy.ext import asyncio as sql_async
 from sqlalchemy.ext import declarative

 from sky import exceptions
 from sky import sky_logging
+from sky import skypilot_config
 from sky.skylet import constants
 from sky.utils import common_utils
+from sky.utils import context_utils
 from sky.utils.db import db_utils
 from sky.utils.db import migration_utils

 if typing.TYPE_CHECKING:
     from sqlalchemy.engine import row

-
-
-
+# Separate callback types for sync and async contexts
+SyncCallbackType = Callable[[str], None]
+AsyncCallbackType = Callable[[str], Awaitable[Any]]
+CallbackType = Union[SyncCallbackType, AsyncCallbackType]

 logger = sky_logging.init_logger(__name__)

 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
+_SQLALCHEMY_ENGINE_ASYNC: Optional[sql_async.AsyncEngine] = None
 _SQLALCHEMY_ENGINE_LOCK = threading.Lock()

+_DB_RETRY_TIMES = 30
+
 Base = declarative.declarative_base()

 # === Database schema ===
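The new callback aliases let a status-transition helper accept either a plain function or a coroutine function. A minimal sketch (hypothetical, not part of the diff; fire_callback is an invented name) of how a caller can service both flavors:

    import inspect
    from typing import Any, Awaitable, Callable, Union

    SyncCallbackType = Callable[[str], None]
    AsyncCallbackType = Callable[[str], Awaitable[Any]]
    CallbackType = Union[SyncCallbackType, AsyncCallbackType]

    async def fire_callback(callback: CallbackType, status: str) -> None:
        result = callback(status)
        if inspect.isawaitable(result):
            # An async callback hands back an awaitable; a sync one returns None.
            await result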
@@ -70,7 +81,7 @@ spot_table = sqlalchemy.Table(
     sqlalchemy.Column('recovery_count', sqlalchemy.Integer, server_default='0'),
     sqlalchemy.Column('job_duration', sqlalchemy.Float, server_default='0'),
     sqlalchemy.Column('failure_reason', sqlalchemy.Text),
-    sqlalchemy.Column('spot_job_id', sqlalchemy.Integer),
+    sqlalchemy.Column('spot_job_id', sqlalchemy.Integer, index=True),
     sqlalchemy.Column('task_id', sqlalchemy.Integer, server_default='0'),
     sqlalchemy.Column('task_name', sqlalchemy.Text),
     sqlalchemy.Column('specs', sqlalchemy.Text),
@@ -129,6 +140,7 @@ def create_table(engine: sqlalchemy.engine.Engine):
     try:
         with orm.Session(engine) as session:
             session.execute(sqlalchemy.text('PRAGMA journal_mode=WAL'))
+            session.execute(sqlalchemy.text('PRAGMA synchronous=1'))
             session.commit()
     except sqlalchemy_exc.OperationalError as e:
         if 'database is locked' not in str(e):
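For reference, the two pragmas applied above, expressed with the standard-library sqlite3 module. WAL journaling plus synchronous=1 (NORMAL) is a common pairing: it reduces fsync traffic while WAL keeps the database consistent across crashes. This standalone sketch is illustrative, not code from the release:

    import sqlite3

    conn = sqlite3.connect('/tmp/example.db')
    conn.execute('PRAGMA journal_mode=WAL')
    conn.execute('PRAGMA synchronous=1')  # 1 == NORMAL
    conn.close()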
@@ -141,6 +153,43 @@ def create_table(engine: sqlalchemy.engine.Engine):
         migration_utils.SPOT_JOBS_VERSION)


+def force_no_postgres() -> bool:
+    """Force no postgres.
+
+    If the db is localhost on the api server, and we are not in consolidation
+    mode, we must force using sqlite and not using the api server on the jobs
+    controller.
+    """
+    conn_string = skypilot_config.get_nested(('db',), None)
+
+    if conn_string:
+        parsed = urllib.parse.urlparse(conn_string)
+        # it freezes if we use the normal get_consolidation_mode function
+        consolidation_mode = skypilot_config.get_nested(
+            ('jobs', 'controller', 'consolidation_mode'), default_value=False)
+        if ((parsed.hostname == 'localhost' or
+             ipaddress.ip_address(parsed.hostname).is_loopback) and
+                not consolidation_mode):
+            return True
+    return False
+
+
+def initialize_and_get_db_async() -> sql_async.AsyncEngine:
+    global _SQLALCHEMY_ENGINE_ASYNC
+    if _SQLALCHEMY_ENGINE_ASYNC is not None:
+        return _SQLALCHEMY_ENGINE_ASYNC
+    with _SQLALCHEMY_ENGINE_LOCK:
+        if _SQLALCHEMY_ENGINE_ASYNC is not None:
+            return _SQLALCHEMY_ENGINE_ASYNC
+
+        _SQLALCHEMY_ENGINE_ASYNC = db_utils.get_engine('spot_jobs',
+                                                       async_engine=True)
+
+        # to create the table in case an async function gets called first
+        initialize_and_get_db()
+        return _SQLALCHEMY_ENGINE_ASYNC
+
+
 # We wrap the sqlalchemy engine initialization in a thread
 # lock to ensure that multiple threads do not initialize the
 # engine which could result in a rare race condition where
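The loopback test in force_no_postgres() combines urllib.parse and ipaddress. A standalone sketch of the same check (the connection string is illustrative; unlike the released code, this version also guards the case where the hostname is a DNS name other than 'localhost', which ipaddress.ip_address() rejects with ValueError):

    import ipaddress
    import urllib.parse

    def is_local_db(conn_string: str) -> bool:
        parsed = urllib.parse.urlparse(conn_string)
        if parsed.hostname is None:
            return False
        if parsed.hostname == 'localhost':
            return True
        try:
            return ipaddress.ip_address(parsed.hostname).is_loopback
        except ValueError:  # a DNS name, not an IP literal
            return False

    print(is_local_db('postgresql://user:pw@127.0.0.1:5432/skypilot'))  # True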
@@ -149,7 +198,6 @@ def create_table(engine: sqlalchemy.engine.Engine):
 # which could result in e1 being garbage collected unexpectedly.
 def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     global _SQLALCHEMY_ENGINE
-
     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE

@@ -167,13 +215,58 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     return _SQLALCHEMY_ENGINE


+def _init_db_async(func):
+    """Initialize the async database. Add backoff to the function call."""
+
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs):
+        if _SQLALCHEMY_ENGINE_ASYNC is None:
+            # this may happen multiple times since there is no locking
+            # here but thats fine, this is just a short circuit for the
+            # common case.
+            await context_utils.to_thread(initialize_and_get_db_async)
+
+        backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=5)
+        last_exc = None
+        for _ in range(_DB_RETRY_TIMES):
+            try:
+                return await func(*args, **kwargs)
+            except (sqlalchemy_exc.OperationalError,
+                    asyncio.exceptions.TimeoutError, OSError,
+                    sqlalchemy_exc.TimeoutError, sqlite3.OperationalError,
+                    sqlalchemy_exc.InterfaceError, sqlite3.InterfaceError) as e:
+                last_exc = e
+                logger.debug(f'DB error: {last_exc}')
+                await asyncio.sleep(backoff.current_backoff())
+        raise last_exc
+
+    return wrapper
+
+
 def _init_db(func):
-    """Initialize the database."""
+    """Initialize the database. Add backoff to the function call."""

     @functools.wraps(func)
     def wrapper(*args, **kwargs):
-
-
+        if _SQLALCHEMY_ENGINE is None:
+            # this may happen multiple times since there is no locking
+            # here but thats fine, this is just a short circuit for the
+            # common case.
+            initialize_and_get_db()
+
+        backoff = common_utils.Backoff(initial_backoff=1, max_backoff_factor=10)
+        last_exc = None
+        for _ in range(_DB_RETRY_TIMES):
+            try:
+                return func(*args, **kwargs)
+            except (sqlalchemy_exc.OperationalError,
+                    asyncio.exceptions.TimeoutError, OSError,
+                    sqlalchemy_exc.TimeoutError, sqlite3.OperationalError,
+                    sqlalchemy_exc.InterfaceError, sqlite3.InterfaceError) as e:
+                last_exc = e
+                logger.debug(f'DB error: {last_exc}')
+                time.sleep(backoff.current_backoff())
+        raise last_exc

     return wrapper

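Both decorators follow the same retry loop: short-circuit engine initialization, then retry the wrapped call on transient database errors with exponential backoff. A standalone sketch of the pattern, with Backoff as a simplified stand-in for sky.utils.common_utils.Backoff and OSError standing in for the full tuple of retryable errors:

    import functools
    import time

    class Backoff:
        """Delay that doubles per call, capped at initial * max_backoff_factor."""

        def __init__(self, initial_backoff: float = 1,
                     max_backoff_factor: int = 5):
            self._next = initial_backoff
            self._max = initial_backoff * max_backoff_factor

        def current_backoff(self) -> float:
            delay = self._next
            self._next = min(self._max, self._next * 2)
            return delay

    def with_db_retries(retries: int = 30):
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                backoff = Backoff()
                last_exc = None
                for _ in range(retries):
                    try:
                        return func(*args, **kwargs)
                    except OSError as e:
                        last_exc = e
                        time.sleep(backoff.current_backoff())
                raise last_exc
            return wrapper
        return decorator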
@@ -416,6 +509,10 @@ class ManagedJobScheduleState(enum.Enum):
     # This job may have been created before scheduler was introduced in #4458.
     # This state is not used by scheduler but just for backward compatibility.
     # TODO(cooperc): remove this in v0.11.0
+    # TODO(luca): the only states we need are INACTIVE, WAITING, ALIVE, and
+    # DONE. ALIVE = old LAUNCHING + ALIVE + ALIVE_BACKOFF + ALIVE_WAITING and
+    # will represent jobs that are claimed by a controller. Delete the rest
+    # in v0.13.0
     INVALID = None
     # The job should be ignored by the scheduler.
     INACTIVE = 'INACTIVE'
@@ -440,32 +537,6 @@


 # === Status transition functions ===
-@_init_db
-def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
-                 pool: Optional[str], pool_hash: Optional[str]):
-    assert _SQLALCHEMY_ENGINE is not None
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        if (_SQLALCHEMY_ENGINE.dialect.name ==
-                db_utils.SQLAlchemyDialect.SQLITE.value):
-            insert_func = sqlite.insert
-        elif (_SQLALCHEMY_ENGINE.dialect.name ==
-              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
-            insert_func = postgresql.insert
-        else:
-            raise ValueError('Unsupported database dialect')
-        insert_stmt = insert_func(job_info_table).values(
-            spot_job_id=job_id,
-            name=name,
-            schedule_state=ManagedJobScheduleState.INACTIVE.value,
-            workspace=workspace,
-            entrypoint=entrypoint,
-            pool=pool,
-            pool_hash=pool_hash,
-        )
-        session.execute(insert_stmt)
-        session.commit()
-
-
 @_init_db
 def set_job_info_without_job_id(name: str, workspace: str, entrypoint: str,
                                 pool: Optional[str],
@@ -517,6 +588,7 @@ def set_pending(
 ):
     """Set the task to pending state."""
     assert _SQLALCHEMY_ENGINE is not None
+
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         session.execute(
             sqlalchemy.insert(spot_table).values(
@@ -530,76 +602,28 @@ def set_pending(
         session.commit()


-@
-def
-                 submit_time: float, resources_str: str,
-                 specs: Dict[str, Union[str,
-                                        int]], callback_func: CallbackType):
-    """Set the task to starting state.
-
-    Args:
-        job_id: The managed job ID.
-        task_id: The task ID.
-        run_timestamp: The run_timestamp of the run. This will be used to
-            determine the log directory of the managed task.
-        submit_time: The time when the managed task is submitted.
-        resources_str: The resources string of the managed task.
-        specs: The specs of the managed task.
-        callback_func: The callback function.
-    """
-    assert _SQLALCHEMY_ENGINE is not None
-    # Use the timestamp in the `run_timestamp` ('sky-2022-10...'), to make
-    # the log directory and submission time align with each other, so as to
-    # make it easier to find them based on one of the values.
-    # Also, using the earlier timestamp should be closer to the term
-    # `submit_at`, which represents the time the managed task is submitted.
-    logger.info('Launching the spot cluster...')
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        count = session.query(spot_table).filter(
-            sqlalchemy.and_(
-                spot_table.c.spot_job_id == job_id,
-                spot_table.c.task_id == task_id,
-                spot_table.c.status == ManagedJobStatus.PENDING.value,
-                spot_table.c.end_at.is_(None),
-            )).update({
-                spot_table.c.resources: resources_str,
-                spot_table.c.submitted_at: submit_time,
-                spot_table.c.status: ManagedJobStatus.STARTING.value,
-                spot_table.c.run_timestamp: run_timestamp,
-                spot_table.c.specs: json.dumps(specs),
-            })
-        session.commit()
-        if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                'Failed to set the task to starting. '
-                f'({count} rows updated)')
-    # SUBMITTED is no longer used, but we keep it for backward compatibility.
-    # TODO(cooperc): remove this in v0.12.0
-    callback_func('SUBMITTED')
-    callback_func('STARTING')
-
-
-@_init_db
-def set_backoff_pending(job_id: int, task_id: int):
+@_init_db_async
+async def set_backoff_pending_async(job_id: int, task_id: int):
     """Set the task to PENDING state if it is in backoff.

     This should only be used to transition from STARTING or RECOVERING back to
     PENDING.
     """
-    assert
-    with
-        count = session.
-            sqlalchemy.
-
-
-
-
-
-
-
-
-
-
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        count = await session.execute(
+            sqlalchemy.update(spot_table).where(
+                sqlalchemy.and_(
+                    spot_table.c.spot_job_id == job_id,
+                    spot_table.c.task_id == task_id,
+                    spot_table.c.status.in_([
+                        ManagedJobStatus.STARTING.value,
+                        ManagedJobStatus.RECOVERING.value
+                    ]),
+                    spot_table.c.end_at.is_(None),
+                )).values({spot_table.c.status: ManagedJobStatus.PENDING.value})
+        )
+        await session.commit()
     if count != 1:
         raise exceptions.ManagedJobStatusError(
             'Failed to set the task back to pending. '
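The body above is the template every new *_async mutator follows: open an AsyncSession, issue an UPDATE guarded by a WHERE clause, commit, and use the row count to detect whether the expected transition actually happened. A self-contained sketch of that pattern against a toy table (names are illustrative; running it needs the aiosqlite driver):

    import asyncio

    import sqlalchemy
    from sqlalchemy.ext import asyncio as sql_async

    metadata = sqlalchemy.MetaData()
    jobs = sqlalchemy.Table(
        'jobs', metadata,
        sqlalchemy.Column('id', sqlalchemy.Integer, primary_key=True),
        sqlalchemy.Column('status', sqlalchemy.Text))

    async def mark_pending(engine: sql_async.AsyncEngine, job_id: int) -> bool:
        async with sql_async.AsyncSession(engine) as session:
            result = await session.execute(
                sqlalchemy.update(jobs).where(
                    sqlalchemy.and_(jobs.c.id == job_id,
                                    jobs.c.status == 'STARTING')).values(
                                        status='PENDING'))
            await session.commit()
            return result.rowcount == 1  # 0 rows means we lost a race

    async def main():
        engine = sql_async.create_async_engine('sqlite+aiosqlite:///:memory:')
        async with engine.begin() as conn:
            await conn.run_sync(metadata.create_all)
            await conn.execute(jobs.insert().values(id=1, status='STARTING'))
        print(await mark_pending(engine, 1))  # True

    asyncio.run(main())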
@@ -608,7 +632,7 @@ def set_backoff_pending(job_id: int, task_id: int):


 @_init_db
-def set_restarting(job_id: int, task_id: int, recovering: bool):
+async def set_restarting_async(job_id: int, task_id: int, recovering: bool):
     """Set the task back to STARTING or RECOVERING from PENDING.

     This should not be used for the initial transition from PENDING to STARTING.
@@ -616,19 +640,20 @@ def set_restarting(job_id: int, task_id: int, recovering: bool):
     after using set_backoff_pending to transition back to PENDING during
     launch retry backoff.
     """
-    assert
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
     target_status = ManagedJobStatus.STARTING.value
     if recovering:
         target_status = ManagedJobStatus.RECOVERING.value
-    with
-
-        sqlalchemy.
-
-
-
-
-
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        result = await session.execute(
+            sqlalchemy.update(spot_table).where(
+                sqlalchemy.and_(
+                    spot_table.c.spot_job_id == job_id,
+                    spot_table.c.task_id == task_id,
+                    spot_table.c.end_at.is_(None),
+                )).values({spot_table.c.status: target_status}))
+        count = result.rowcount
+        await session.commit()
     logger.debug(f'back to {target_status}')
     if count != 1:
         raise exceptions.ManagedJobStatusError(
@@ -638,137 +663,6 @@ def set_restarting(job_id: int, task_id: int, recovering: bool):
     # initial (pre-`set_backoff_pending`) transition to STARTING or RECOVERING.


-@_init_db
-def set_started(job_id: int, task_id: int, start_time: float,
-                callback_func: CallbackType):
-    """Set the task to started state."""
-    assert _SQLALCHEMY_ENGINE is not None
-    logger.info('Job started.')
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        count = session.query(spot_table).filter(
-            sqlalchemy.and_(
-                spot_table.c.spot_job_id == job_id,
-                spot_table.c.task_id == task_id,
-                spot_table.c.status.in_([
-                    ManagedJobStatus.STARTING.value,
-                    # If the task is empty, we will jump straight
-                    # from PENDING to RUNNING
-                    ManagedJobStatus.PENDING.value
-                ]),
-                spot_table.c.end_at.is_(None),
-            )).update({
-                spot_table.c.status: ManagedJobStatus.RUNNING.value,
-                spot_table.c.start_at: start_time,
-                spot_table.c.last_recovered_at: start_time,
-            })
-        session.commit()
-        if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                f'Failed to set the task to started. '
-                f'({count} rows updated)')
-    callback_func('STARTED')
-
-
-@_init_db
-def set_recovering(job_id: int, task_id: int, force_transit_to_recovering: bool,
-                   callback_func: CallbackType):
-    """Set the task to recovering state, and update the job duration."""
-    assert _SQLALCHEMY_ENGINE is not None
-    logger.info('=== Recovering... ===')
-    # NOTE: if we are resuming from a controller failure and the previous status
-    # is STARTING, the initial value of `last_recovered_at` might not be set
-    # yet (default value -1). In this case, we should not add current timestamp.
-    # Otherwise, the job duration will be incorrect (~55 years from 1970).
-    current_time = time.time()
-
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        if force_transit_to_recovering:
-            # For the HA job controller, it is possible that the jobs came from
-            # any processing status to recovering. But it should not be any
-            # terminal status as such jobs will not be recovered; and it should
-            # not be CANCELLING as we will directly trigger a cleanup.
-            status_condition = spot_table.c.status.in_(
-                [s.value for s in ManagedJobStatus.processing_statuses()])
-        else:
-            status_condition = (
-                spot_table.c.status == ManagedJobStatus.RUNNING.value)
-
-        count = session.query(spot_table).filter(
-            sqlalchemy.and_(
-                spot_table.c.spot_job_id == job_id,
-                spot_table.c.task_id == task_id,
-                status_condition,
-                spot_table.c.end_at.is_(None),
-            )).update({
-                spot_table.c.status: ManagedJobStatus.RECOVERING.value,
-                spot_table.c.job_duration: sqlalchemy.case(
-                    (spot_table.c.last_recovered_at >= 0,
-                     spot_table.c.job_duration + current_time -
-                     spot_table.c.last_recovered_at),
-                    else_=spot_table.c.job_duration),
-                spot_table.c.last_recovered_at: sqlalchemy.case(
-                    (spot_table.c.last_recovered_at < 0, current_time),
-                    else_=spot_table.c.last_recovered_at),
-            })
-        session.commit()
-        if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                f'Failed to set the task to recovering. '
-                f'({count} rows updated)')
-    callback_func('RECOVERING')
-
-
-@_init_db
-def set_recovered(job_id: int, task_id: int, recovered_time: float,
-                  callback_func: CallbackType):
-    """Set the task to recovered."""
-    assert _SQLALCHEMY_ENGINE is not None
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        count = session.query(spot_table).filter(
-            sqlalchemy.and_(
-                spot_table.c.spot_job_id == job_id,
-                spot_table.c.task_id == task_id,
-                spot_table.c.status == ManagedJobStatus.RECOVERING.value,
-                spot_table.c.end_at.is_(None),
-            )).update({
-                spot_table.c.status: ManagedJobStatus.RUNNING.value,
-                spot_table.c.last_recovered_at: recovered_time,
-                spot_table.c.recovery_count: spot_table.c.recovery_count + 1,
-            })
-        session.commit()
-        if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                f'Failed to set the task to recovered. '
-                f'({count} rows updated)')
-    logger.info('==== Recovered. ====')
-    callback_func('RECOVERED')
-
-
-@_init_db
-def set_succeeded(job_id: int, task_id: int, end_time: float,
-                  callback_func: CallbackType):
-    """Set the task to succeeded, if it is in a non-terminal state."""
-    assert _SQLALCHEMY_ENGINE is not None
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        count = session.query(spot_table).filter(
-            sqlalchemy.and_(
-                spot_table.c.spot_job_id == job_id,
-                spot_table.c.task_id == task_id,
-                spot_table.c.status == ManagedJobStatus.RUNNING.value,
-                spot_table.c.end_at.is_(None),
-            )).update({
-                spot_table.c.status: ManagedJobStatus.SUCCEEDED.value,
-                spot_table.c.end_at: end_time,
-            })
-        session.commit()
-        if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                f'Failed to set the task to succeeded. '
-                f'({count} rows updated)')
-    callback_func('SUCCEEDED')
-    logger.info('Job succeeded.')
-
-
 @_init_db
 def set_failed(
     job_id: int,
@@ -834,51 +728,30 @@ def set_failed(


 @_init_db
-def
-    """Set
-
-    task_id is not needed, because we expect the job should be cancelled
-    as a whole, and we should not cancel a single task.
-    """
+def set_pending_cancelled(job_id: int):
+    """Set the job as pending cancelled, if it is in non-terminal states."""
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-
-
+        # Subquery to get the spot_job_ids that match the joined condition
+        subquery = session.query(spot_table.c.job_id).join(
+            job_info_table,
+            spot_table.c.spot_job_id == job_info_table.c.spot_job_id).filter(
                 spot_table.c.spot_job_id == job_id,
-            spot_table.c.
-
-
-
-
-
-
-        logger.info('Cancellation skipped, job is already terminal')
-
-
-@_init_db
-def set_cancelled(job_id: int, callback_func: CallbackType):
-    """Set tasks in the job as cancelled, if they are in CANCELLING state.
+                spot_table.c.status == ManagedJobStatus.PENDING.value,
+                sqlalchemy.or_(
+                    job_info_table.c.schedule_state ==
+                    ManagedJobScheduleState.WAITING.value,
+                    job_info_table.c.schedule_state ==
+                    ManagedJobScheduleState.INACTIVE.value,
+                ),
+            ).subquery()

-    The set_cancelling should be called before this function.
-    """
-    assert _SQLALCHEMY_ENGINE is not None
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
         count = session.query(spot_table).filter(
-
-            spot_table.c.
-
-            )).update({
-                spot_table.c.status: ManagedJobStatus.CANCELLED.value,
-                spot_table.c.end_at: time.time(),
-            })
+            spot_table.c.job_id.in_(subquery)).update(
+                {spot_table.c.status: ManagedJobStatus.CANCELLED.value},
+                synchronize_session=False)
         session.commit()
-
-    if updated:
-        logger.info('Job cancelled.')
-        callback_func('CANCELLED')
-    else:
-        logger.info('Cancellation skipped, job is not CANCELLING')
+        return count > 0


 @_init_db
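set_pending_cancelled() uses the usual workaround for an UPDATE that needs a join: resolve the join down to primary keys in a subquery, then update by key with synchronize_session=False (safe here because the session is discarded immediately afterwards). A simplified, self-contained sketch of that shape with stand-in tables:

    import sqlalchemy
    from sqlalchemy import orm

    metadata = sqlalchemy.MetaData()
    spot = sqlalchemy.Table(
        'spot', metadata,
        sqlalchemy.Column('job_id', sqlalchemy.Integer, primary_key=True),
        sqlalchemy.Column('spot_job_id', sqlalchemy.Integer),
        sqlalchemy.Column('status', sqlalchemy.Text))
    job_info = sqlalchemy.Table(
        'job_info', metadata,
        sqlalchemy.Column('spot_job_id', sqlalchemy.Integer, primary_key=True),
        sqlalchemy.Column('schedule_state', sqlalchemy.Text))

    def cancel_pending(engine: sqlalchemy.engine.Engine, job_id: int) -> bool:
        with orm.Session(engine) as session:
            # Resolve the join down to primary keys first...
            subquery = session.query(spot.c.job_id).join(
                job_info,
                spot.c.spot_job_id == job_info.c.spot_job_id).filter(
                    spot.c.spot_job_id == job_id,
                    spot.c.status == 'PENDING',
                    job_info.c.schedule_state.in_(['WAITING', 'INACTIVE']),
                ).subquery()
            # ...then update by key; most backends cannot join inside UPDATE.
            count = session.query(spot).filter(
                spot.c.job_id.in_(
                    sqlalchemy.select(subquery.c.job_id))).update(
                        {spot.c.status: 'CANCELLED'},
                        synchronize_session=False)
            session.commit()
            return count > 0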
@@ -936,45 +809,6 @@ def get_nonterminal_job_ids_by_name(name: Optional[str],
     return job_ids


-@_init_db
-def get_schedule_live_jobs(job_id: Optional[int]) -> List[Dict[str, Any]]:
-    """Get jobs from the database that have a live schedule_state.
-
-    This should return job(s) that are not INACTIVE, WAITING, or DONE. So a
-    returned job should correspond to a live job controller process, with one
-    exception: the job may have just transitioned from WAITING to LAUNCHING, but
-    the controller process has not yet started.
-    """
-    assert _SQLALCHEMY_ENGINE is not None
-
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        query = sqlalchemy.select(
-            job_info_table.c.spot_job_id,
-            job_info_table.c.schedule_state,
-            job_info_table.c.controller_pid,
-        ).where(~job_info_table.c.schedule_state.in_([
-            ManagedJobScheduleState.INACTIVE.value,
-            ManagedJobScheduleState.WAITING.value,
-            ManagedJobScheduleState.DONE.value,
-        ]))
-
-        if job_id is not None:
-            query = query.where(job_info_table.c.spot_job_id == job_id)
-
-        query = query.order_by(job_info_table.c.spot_job_id.desc())
-
-        rows = session.execute(query).fetchall()
-        jobs = []
-        for row in rows:
-            job_dict = {
-                'job_id': row[0],
-                'schedule_state': ManagedJobScheduleState(row[1]),
-                'controller_pid': row[2],
-            }
-            jobs.append(job_dict)
-        return jobs
-
-
 @_init_db
 def get_jobs_to_check_status(job_id: Optional[int] = None) -> List[int]:
     """Get jobs that need controller process checking.
@@ -1035,32 +869,6 @@ def get_jobs_to_check_status(job_id: Optional[int] = None) -> List[int]:
     return [row[0] for row in rows if row[0] is not None]


-@_init_db
-def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
-    """Get all job ids by name."""
-    assert _SQLALCHEMY_ENGINE is not None
-
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        query = sqlalchemy.select(
-            spot_table.c.spot_job_id.distinct()).select_from(
-                spot_table.outerjoin(
-                    job_info_table,
-                    spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
-        if name is not None:
-            # We match the job name from `job_info` for the jobs submitted after
-            # #1982, and from `spot` for the jobs submitted before #1982, whose
-            # job_info is not available.
-            name_condition = sqlalchemy.or_(
-                job_info_table.c.name == name,
-                sqlalchemy.and_(job_info_table.c.name.is_(None),
-                                spot_table.c.task_name == name))
-            query = query.where(name_condition)
-        query = query.order_by(spot_table.c.spot_job_id.desc())
-        rows = session.execute(query).fetchall()
-        job_ids = [row[0] for row in rows if row[0] is not None]
-        return job_ids
-
-
 @_init_db
 def _get_all_task_ids_statuses(
         job_id: int) -> List[Tuple[int, ManagedJobStatus]]:
@@ -1092,18 +900,6 @@ def get_all_task_ids_names_statuses_logs(
             for row in id_names]


-@_init_db
-def get_job_status_with_task_id(job_id: int,
-                                task_id: int) -> Optional[ManagedJobStatus]:
-    assert _SQLALCHEMY_ENGINE is not None
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        status = session.execute(
-            sqlalchemy.select(spot_table.c.status).where(
-                sqlalchemy.and_(spot_table.c.spot_job_id == job_id,
-                                spot_table.c.task_id == task_id))).fetchone()
-        return ManagedJobStatus(status[0]) if status else None
-
-
 def get_num_tasks(job_id: int) -> int:
     return len(_get_all_task_ids_statuses(job_id))

@@ -1131,6 +927,16 @@ def get_latest_task_id_status(
     return task_id, status


+@_init_db
+def get_job_controller_pid(job_id: int) -> Optional[int]:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        pid = session.execute(
+            sqlalchemy.select(job_info_table.c.controller_pid).where(
+                job_info_table.c.spot_job_id == job_id)).fetchone()
+        return pid[0] if pid else None
+
+
 def get_status(job_id: int) -> Optional[ManagedJobStatus]:
     _, status = get_latest_task_id_status(job_id)
     return status
@@ -1243,30 +1049,10 @@ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:


 @_init_db
-def
-
-
-
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        where_conditions = [spot_table.c.spot_job_id == job_id]
-        if task_id is not None:
-            where_conditions.append(spot_table.c.task_id == task_id)
-        local_log_file = session.execute(
-            sqlalchemy.select(spot_table.c.local_log_file).where(
-                sqlalchemy.and_(*where_conditions))).fetchone()
-        return local_log_file[-1] if local_log_file else None
-
-
-# === Scheduler state functions ===
-# Only the scheduler should call these functions. They may require holding the
-# scheduler lock to work correctly.
-
-
-@_init_db
-def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
-                          original_user_yaml_path: str, env_file_path: str,
-                          user_hash: str, priority: int) -> bool:
-    """Do not call without holding the scheduler lock.
+def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
+                          original_user_yaml_path: str, env_file_path: str,
+                          user_hash: str, priority: int):
+    """Do not call without holding the scheduler lock.

     Returns: Whether this is a recovery run or not.
         If this is a recovery run, the job may already be in the WAITING
@@ -1277,11 +1063,7 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         updated_count = session.query(job_info_table).filter(
-            sqlalchemy.and_(
-                job_info_table.c.spot_job_id == job_id,
-                job_info_table.c.schedule_state ==
-                ManagedJobScheduleState.INACTIVE.value,
-            )
+            sqlalchemy.and_(job_info_table.c.spot_job_id == job_id,)
         ).update({
             job_info_table.c.schedule_state:
                 ManagedJobScheduleState.WAITING.value,
@@ -1292,9 +1074,7 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
             job_info_table.c.priority: priority,
         })
         session.commit()
-        # For a recovery run, the job may already be in the WAITING state.
         assert updated_count <= 1, (job_id, updated_count)
-        return updated_count == 0


 @_init_db
@@ -1319,17 +1099,15 @@ def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
         session.commit()


-@
-def
-
+@_init_db_async
+async def set_job_id_on_pool_cluster_async(job_id: int,
+                                           job_id_on_pool_cluster: int) -> None:
     """Set the job id on the pool cluster for a job."""
-    assert
-    with
-        session.
-            job_info_table.c.
-
-        })
-        session.commit()
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        await session.execute(job_info_table.c.spot_job_id == job_id).update(
+            {job_info_table.c.job_id_on_pool_cluster: job_id_on_pool_cluster})
+        await session.commit()


 @_init_db
@@ -1347,77 +1125,54 @@ def get_pool_submit_info(job_id: int) -> Tuple[Optional[str], Optional[int]]:
         return info[0], info[1]


-@
-def
-
-    """
-    assert
-    with orm.Session(
-
-            sqlalchemy.
-
-
-
-
-
-
-        session.commit()
-        assert updated_count == 1, (job_id, updated_count)
-
-
-@_init_db
-def scheduler_set_alive(job_id: int) -> None:
-    """Do not call without holding the scheduler lock."""
-    assert _SQLALCHEMY_ENGINE is not None
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        updated_count = session.query(job_info_table).filter(
-            sqlalchemy.and_(
-                job_info_table.c.spot_job_id == job_id,
-                job_info_table.c.schedule_state ==
-                ManagedJobScheduleState.LAUNCHING.value,
-            )).update({
-                job_info_table.c.schedule_state:
-                    ManagedJobScheduleState.ALIVE.value
-            })
-        session.commit()
-        assert updated_count == 1, (job_id, updated_count)
+@_init_db_async
+async def get_pool_submit_info_async(
+        job_id: int) -> Tuple[Optional[str], Optional[int]]:
+    """Get the cluster name and job id on the pool from the managed job id."""
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with orm.Session(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        info = await session.execute(
+            sqlalchemy.select(job_info_table.c.current_cluster_name,
+                              job_info_table.c.job_id_on_pool_cluster).where(
+                                  job_info_table.c.spot_job_id == job_id)
+        ).fetchone()
+        if info is None:
+            return None, None
+        return info[0], info[1]


-@
-def
-
-
-
-
-
-
-
-
-
-
-                    ManagedJobScheduleState.ALIVE_BACKOFF.value
-            })
-        session.commit()
-        assert updated_count == 1, (job_id, updated_count)
+@_init_db_async
+async def scheduler_set_launching_async(job_id: int):
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        await session.execute(
+            sqlalchemy.update(job_info_table).where(
+                sqlalchemy.and_(job_info_table.c.spot_job_id == job_id)).values(
+                    {
+                        job_info_table.c.schedule_state:
+                            ManagedJobScheduleState.LAUNCHING.value
+                    }))
+        await session.commit()


-@
-def
+@_init_db_async
+async def scheduler_set_alive_async(job_id: int) -> None:
     """Do not call without holding the scheduler lock."""
-    assert
-    with
-
-        sqlalchemy.
-
-
-
-                ManagedJobScheduleState.
-
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        result = await session.execute(
+            sqlalchemy.update(job_info_table).where(
+                sqlalchemy.and_(
+                    job_info_table.c.spot_job_id == job_id,
+                    job_info_table.c.schedule_state ==
+                    ManagedJobScheduleState.LAUNCHING.value,
+                )).values({
             job_info_table.c.schedule_state:
-                ManagedJobScheduleState.
-            })
-
-
+                ManagedJobScheduleState.ALIVE.value
+            }))
+        changes = result.rowcount
+        await session.commit()
+        assert changes == 1, (job_id, changes)


 @_init_db
@@ -1439,16 +1194,6 @@ def scheduler_set_done(job_id: int, idempotent: bool = False) -> None:
        assert updated_count == 1, (job_id, updated_count)


-@_init_db
-def set_job_controller_pid(job_id: int, pid: int):
-    assert _SQLALCHEMY_ENGINE is not None
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        updated_count = session.query(job_info_table).filter_by(
-            spot_job_id=job_id).update({job_info_table.c.controller_pid: pid})
-        session.commit()
-        assert updated_count == 1, (job_id, updated_count)
-
-
 @_init_db
 def get_job_schedule_state(job_id: int) -> ManagedJobScheduleState:
     assert _SQLALCHEMY_ENGINE is not None
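The next hunk turns the scheduler's job selection into an atomic claim: lock the best WAITING candidate with SELECT ... FOR UPDATE, flip it to LAUNCHING in the same transaction, and back off if the guarded UPDATE touches zero rows. A condensed synchronous sketch of that pattern (names are illustrative; SQLAlchemy's SQLite dialect silently ignores with_for_update(), so the row lock only has teeth on PostgreSQL):

    import sqlalchemy

    def claim_next(session, jobs, pid: int):
        row = session.execute(
            sqlalchemy.select(jobs.c.id).where(
                jobs.c.state == 'WAITING').order_by(
                    jobs.c.priority.desc(), jobs.c.id.asc()).limit(
                        1).with_for_update()).fetchone()
        if row is None:
            return None
        result = session.execute(
            sqlalchemy.update(jobs).where(
                sqlalchemy.and_(jobs.c.id == row[0],
                                jobs.c.state == 'WAITING')).values(
                                    state='LAUNCHING', controller_pid=pid))
        if result.rowcount != 1:
            session.rollback()  # lost the race despite the lock
            return None
        session.commit()
        return row[0]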
@@ -1527,58 +1272,78 @@ def get_nonterminal_job_ids_by_pool(pool: str,
     return job_ids


-@
-def
+@_init_db_async
+async def get_waiting_job_async(pid: int) -> Optional[Dict[str, Any]]:
     """Get the next job that should transition to LAUNCHING.

-    Selects the highest-priority WAITING or ALIVE_WAITING job
-
-
+    Selects the highest-priority WAITING or ALIVE_WAITING job and atomically
+    transitions it to LAUNCHING state to prevent race conditions.
+
+    Returns the job information if a job was successfully transitioned to
+    LAUNCHING, or None if no suitable job was found.

     Backwards compatibility note: jobs submitted before #4485 will have no
     schedule_state and will be ignored by this SQL query.
     """
-    assert
-    with
-        #
-
-        # ALIVE_BACKOFF job's priority.
-        # First, get the max priority of LAUNCHING or ALIVE_BACKOFF jobs
-        max_priority_subquery = sqlalchemy.select(
-            sqlalchemy.func.max(job_info_table.c.priority)).where(
-                job_info_table.c.schedule_state.in_([
-                    ManagedJobScheduleState.LAUNCHING.value,
-                    ManagedJobScheduleState.ALIVE_BACKOFF.value,
-                ])).scalar_subquery()
-        # Main query for waiting jobs
-        select_conds = [
-            job_info_table.c.schedule_state.in_([
-                ManagedJobScheduleState.WAITING.value,
-                ManagedJobScheduleState.ALIVE_WAITING.value,
-            ]),
-            job_info_table.c.priority >= sqlalchemy.func.coalesce(
-                max_priority_subquery, 0),
-        ]
-        query = sqlalchemy.select(
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        # Select the highest priority waiting job for update (locks the row)
+        select_query = sqlalchemy.select(
             job_info_table.c.spot_job_id,
             job_info_table.c.schedule_state,
             job_info_table.c.dag_yaml_path,
             job_info_table.c.env_file_path,
+            job_info_table.c.controller_pid,
             job_info_table.c.pool,
-        ).where(
-            job_info_table.c.
-
-
-
+        ).where(
+            job_info_table.c.schedule_state.in_([
+                ManagedJobScheduleState.WAITING.value,
+            ])).order_by(
+                job_info_table.c.priority.desc(),
+                job_info_table.c.spot_job_id.asc(),
+            ).limit(1).with_for_update()
+
+        # Execute the select with row locking
+        result = await session.execute(select_query)
+        waiting_job_row = result.fetchone()
+
         if waiting_job_row is None:
             return None

+        job_id = waiting_job_row[0]
+        current_state = ManagedJobScheduleState(waiting_job_row[1])
+        dag_yaml_path = waiting_job_row[2]
+        env_file_path = waiting_job_row[3]
+        controller_pid = waiting_job_row[4]
+        pool = waiting_job_row[5]
+
+        # Update the job state to LAUNCHING
+        update_result = await session.execute(
+            sqlalchemy.update(job_info_table).where(
+                sqlalchemy.and_(
+                    job_info_table.c.spot_job_id == job_id,
+                    job_info_table.c.schedule_state == current_state.value,
+                )).values({
+                    job_info_table.c.schedule_state:
+                        ManagedJobScheduleState.LAUNCHING.value,
+                    job_info_table.c.controller_pid: pid,
+                }))
+
+        if update_result.rowcount != 1:
+            # Update failed, rollback and return None
+            await session.rollback()
+            return None
+
+        # Commit the transaction
+        await session.commit()
+
         return {
-            'job_id':
-            'schedule_state':
-            'dag_yaml_path':
-            'env_file_path':
-            '
+            'job_id': job_id,
+            'schedule_state': current_state,
+            'dag_yaml_path': dag_yaml_path,
+            'env_file_path': env_file_path,
+            'old_pid': controller_pid,
+            'pool': pool,
         }

@@ -1641,3 +1406,429 @@ def remove_ha_recovery_script(job_id: int) -> None:
|
|
|
1641
1406
|
session.query(ha_recovery_script_table).filter_by(
|
|
1642
1407
|
job_id=job_id).delete()
|
|
1643
1408
|
session.commit()
|
|
1409
|
+
|
|
1410
|
+
|
|
1411
|
+
@_init_db_async
|
|
1412
|
+
async def get_latest_task_id_status_async(
|
|
1413
|
+
job_id: int) -> Union[Tuple[int, ManagedJobStatus], Tuple[None, None]]:
|
|
1414
|
+
"""Returns the (task id, status) of the latest task of a job."""
|
|
1415
|
+
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
1416
|
+
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
1417
|
+
result = await session.execute(
|
|
1418
|
+
sqlalchemy.select(
|
|
1419
|
+
spot_table.c.task_id,
|
|
1420
|
+
spot_table.c.status,
|
|
1421
|
+
).where(spot_table.c.spot_job_id == job_id).order_by(
|
|
1422
|
+
spot_table.c.task_id.asc()))
|
|
1423
|
+
id_statuses = [
|
|
1424
|
+
(row[0], ManagedJobStatus(row[1])) for row in result.fetchall()
|
|
1425
|
+
]
|
|
1426
|
+
|
|
1427
|
+
if not id_statuses:
|
|
1428
|
+
return None, None
|
|
1429
|
+
task_id, status = next(
|
|
1430
|
+
((tid, st) for tid, st in id_statuses if not st.is_terminal()),
|
|
1431
|
+
id_statuses[-1],
|
|
1432
|
+
)
|
|
1433
|
+
return task_id, status
|
|
1434
|
+
|
|
1435
|
+
|
|
1436
|
+
@_init_db_async
|
|
1437
|
+
async def set_starting_async(job_id: int, task_id: int, run_timestamp: str,
|
|
1438
|
+
submit_time: float, resources_str: str,
|
|
1439
|
+
specs: Dict[str, Union[str, int]],
|
|
1440
|
+
callback_func: AsyncCallbackType):
|
|
1441
|
+
"""Set the task to starting state."""
|
|
1442
|
+
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
1443
|
+
logger.info('Launching the spot cluster...')
|
|
1444
|
+
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
1445
|
+
result = await session.execute(
|
|
1446
|
+
sqlalchemy.update(spot_table).where(
|
|
1447
|
+
sqlalchemy.and_(
|
|
1448
|
+
spot_table.c.spot_job_id == job_id,
|
|
1449
|
+
spot_table.c.task_id == task_id,
|
|
1450
|
+
spot_table.c.status == ManagedJobStatus.PENDING.value,
|
|
1451
|
+
spot_table.c.end_at.is_(None),
|
|
1452
|
+
)).values({
|
|
1453
|
+
spot_table.c.resources: resources_str,
|
|
1454
|
+
spot_table.c.submitted_at: submit_time,
|
|
1455
|
+
spot_table.c.status: ManagedJobStatus.STARTING.value,
|
|
1456
|
+
spot_table.c.run_timestamp: run_timestamp,
|
|
1457
|
+
spot_table.c.specs: json.dumps(specs),
|
|
1458
|
+
}))
|
|
1459
|
+
count = result.rowcount
|
|
1460
|
+
await session.commit()
|
|
1461
|
+
if count != 1:
|
|
1462
|
+
raise exceptions.ManagedJobStatusError(
|
|
1463
|
+
'Failed to set the task to starting. '
|
|
1464
|
+
f'({count} rows updated)')
|
|
1465
|
+
await callback_func('SUBMITTED')
|
|
1466
|
+
await callback_func('STARTING')
|
|
1467
|
+
|
|
1468
|
+
|
|
1469
|
+
@_init_db_async
|
|
1470
|
+
async def set_started_async(job_id: int, task_id: int, start_time: float,
|
|
1471
|
+
callback_func: AsyncCallbackType):
|
|
1472
|
+
"""Set the task to started state."""
|
|
1473
|
+
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
1474
|
+
logger.info('Job started.')
|
|
1475
|
+
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
1476
|
+
result = await session.execute(
|
|
1477
|
+
sqlalchemy.update(spot_table).where(
|
|
1478
|
+
sqlalchemy.and_(
|
|
1479
|
+
spot_table.c.spot_job_id == job_id,
|
|
1480
|
+
spot_table.c.task_id == task_id,
|
|
1481
|
+
spot_table.c.status.in_([
|
|
1482
|
+
ManagedJobStatus.STARTING.value,
|
|
1483
|
+
ManagedJobStatus.PENDING.value
|
|
1484
|
+
]),
|
|
1485
|
+
spot_table.c.end_at.is_(None),
|
|
1486
|
+
)).values({
|
|
1487
|
+
spot_table.c.status: ManagedJobStatus.RUNNING.value,
|
|
1488
|
+
spot_table.c.start_at: start_time,
|
|
1489
|
+
spot_table.c.last_recovered_at: start_time,
|
|
1490
|
+
}))
|
|
1491
|
+
count = result.rowcount
|
|
1492
|
+
await session.commit()
|
|
1493
|
+
if count != 1:
|
|
1494
|
+
raise exceptions.ManagedJobStatusError(
|
|
1495
|
+
f'Failed to set the task to started. '
|
|
1496
|
+
f'({count} rows updated)')
|
|
1497
|
+
await callback_func('STARTED')
|
|
1498
|
+
|
|
1499
|
+
|
|
1500
|
+
@_init_db_async
|
|
1501
|
+
async def get_job_status_with_task_id_async(
|
|
1502
|
+
job_id: int, task_id: int) -> Optional[ManagedJobStatus]:
|
|
1503
|
+
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
1504
|
+
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
1505
|
+
result = await session.execute(
|
|
1506
|
+
sqlalchemy.select(spot_table.c.status).where(
|
|
1507
|
+
sqlalchemy.and_(spot_table.c.spot_job_id == job_id,
|
|
1508
|
+
spot_table.c.task_id == task_id)))
|
|
1509
|
+
status = result.fetchone()
|
|
1510
|
+
return ManagedJobStatus(status[0]) if status else None
|
|
1511
|
+
|
|
1512
|
+
|
|
1513
|
+
@_init_db_async
|
|
1514
|
+
async def set_recovering_async(job_id: int, task_id: int,
|
|
1515
|
+
force_transit_to_recovering: bool,
|
|
1516
|
+
callback_func: AsyncCallbackType):
|
|
1517
|
+
"""Set the task to recovering state, and update the job duration."""
|
|
1518
|
+
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
1519
|
+
logger.info('=== Recovering... ===')
|
|
1520
|
+
current_time = time.time()
|
|
1521
|
+
|
|
1522
|
+
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
1523
|
+
if force_transit_to_recovering:
|
|
1524
|
+
status_condition = spot_table.c.status.in_(
|
|
1525
|
+
[s.value for s in ManagedJobStatus.processing_statuses()])
|
|
1526
|
+
else:
|
|
1527
|
+
status_condition = (
|
|
1528
|
+
spot_table.c.status == ManagedJobStatus.RUNNING.value)
|
|
1529
|
+
|
|
1530
|
+
result = await session.execute(
|
|
1531
|
+
sqlalchemy.update(spot_table).where(
|
|
1532
|
+
sqlalchemy.and_(
|
|
1533
|
+
spot_table.c.spot_job_id == job_id,
|
|
1534
|
+
spot_table.c.task_id == task_id,
|
|
1535
|
+
status_condition,
|
|
1536
|
+
spot_table.c.end_at.is_(None),
|
|
1537
|
+
)).values({
|
|
1538
|
+
spot_table.c.status: ManagedJobStatus.RECOVERING.value,
|
|
1539
|
+
spot_table.c.job_duration: sqlalchemy.case(
|
|
1540
|
+
(spot_table.c.last_recovered_at >= 0,
|
|
1541
|
+
spot_table.c.job_duration + current_time -
|
|
1542
|
+
spot_table.c.last_recovered_at),
|
|
1543
|
+
else_=spot_table.c.job_duration),
|
|
1544
|
+
spot_table.c.last_recovered_at: sqlalchemy.case(
|
|
1545
|
+
(spot_table.c.last_recovered_at < 0, current_time),
|
|
1546
|
+
else_=spot_table.c.last_recovered_at),
|
|
1547
|
+
}))
|
|
1548
|
+
count = result.rowcount
|
|
1549
|
+
await session.commit()
|
|
1550
|
+
if count != 1:
|
|
1551
|
+
raise exceptions.ManagedJobStatusError(
|
|
1552
|
+
f'Failed to set the task to recovering. '
|
|
1553
|
+
f'({count} rows updated)')
|
|
1554
|
+
await callback_func('RECOVERING')
|
|
1555
|
+
|
|
1556
|
+
|
|
1557
|
+
@_init_db_async
|
|
1558
|
+
async def set_recovered_async(job_id: int, task_id: int, recovered_time: float,
|
|
1559
|
+
callback_func: AsyncCallbackType):
|
|
1560
|
+
"""Set the task to recovered."""
|
|
1561
|
+
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
1562
|
+
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
1563
|
+
result = await session.execute(
|
|
1564
|
+
sqlalchemy.update(spot_table).where(
|
|
1565
|
+
sqlalchemy.and_(
|
|
1566
|
+
spot_table.c.spot_job_id == job_id,
|
|
1567
|
+
spot_table.c.task_id == task_id,
|
|
1568
|
+
spot_table.c.status == ManagedJobStatus.RECOVERING.value,
|
|
1569
|
+
spot_table.c.end_at.is_(None),
|
|
1570
|
+
)).values({
|
|
1571
|
+
spot_table.c.status: ManagedJobStatus.RUNNING.value,
|
|
1572
|
+
spot_table.c.last_recovered_at: recovered_time,
|
|
1573
|
+
spot_table.c.recovery_count: spot_table.c.recovery_count +
|
|
1574
|
+
1,
|
|
1575
|
+
}))
|
|
1576
|
+
count = result.rowcount
|
|
1577
|
+
await session.commit()
|
|
1578
|
+
if count != 1:
|
|
1579
|
+
raise exceptions.ManagedJobStatusError(
|
|
1580
|
+
f'Failed to set the task to recovered. '
|
|
1581
|
+
f'({count} rows updated)')
|
|
1582
|
+
logger.info('==== Recovered. ====')
|
|
1583
|
+
await callback_func('RECOVERED')
|
|
1584
|
+
|
|
1585
|
+
|
|
1586
|
+
@_init_db_async
|
|
1587
|
+
async def set_succeeded_async(job_id: int, task_id: int, end_time: float,
|
|
1588
|
+
callback_func: AsyncCallbackType):
|
|
1589
|
+
"""Set the task to succeeded, if it is in a non-terminal state."""
|
|
1590
|
+
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
1591
|
+
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
1592
|
+
result = await session.execute(
|
|
1593
|
+
sqlalchemy.update(spot_table).where(
|
|
1594
|
+
sqlalchemy.and_(
|
|
1595
|
+
spot_table.c.spot_job_id == job_id,
|
|
1596
|
+
spot_table.c.task_id == task_id,
|
|
1597
|
+
spot_table.c.status == ManagedJobStatus.RUNNING.value,
|
|
1598
|
+
spot_table.c.end_at.is_(None),
|
|
1599
|
+
)).values({
|
|
1600
|
+
spot_table.c.status: ManagedJobStatus.SUCCEEDED.value,
|
|
1601
|
+
spot_table.c.end_at: end_time,
|
|
1602
|
+
}))
|
|
1603
|
+
count = result.rowcount
|
|
1604
|
+
await session.commit()
|
|
1605
|
+
if count != 1:
|
|
1606
|
+
raise exceptions.ManagedJobStatusError(
|
|
1607
|
+
f'Failed to set the task to succeeded. '
|
|
1608
|
+
f'({count} rows updated)')
|
|
1609
|
+
await callback_func('SUCCEEDED')
|
|
1610
|
+
logger.info('Job succeeded.')
|
|
1611
|
+
|
|
1612
|
+
|
|
1613
|
+
+@_init_db_async
+async def set_failed_async(
+    job_id: int,
+    task_id: Optional[int],
+    failure_type: ManagedJobStatus,
+    failure_reason: str,
+    callback_func: Optional[AsyncCallbackType] = None,
+    end_time: Optional[float] = None,
+    override_terminal: bool = False,
+):
+    """Set an entire job or task to failed."""
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    assert failure_type.is_failed(), failure_type
+    end_time = time.time() if end_time is None else end_time
+
+    fields_to_set: Dict[str, Any] = {
+        spot_table.c.status: failure_type.value,
+        spot_table.c.failure_reason: failure_reason,
+    }
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        # Get previous status
+        result = await session.execute(
+            sqlalchemy.select(
+                spot_table.c.status).where(spot_table.c.spot_job_id == job_id))
+        previous_status_row = result.fetchone()
+        previous_status = ManagedJobStatus(previous_status_row[0])
+        if previous_status == ManagedJobStatus.RECOVERING:
+            fields_to_set[spot_table.c.last_recovered_at] = end_time
+        where_conditions = [spot_table.c.spot_job_id == job_id]
+        if task_id is not None:
+            where_conditions.append(spot_table.c.task_id == task_id)
+        if override_terminal:
+            fields_to_set[spot_table.c.end_at] = sqlalchemy.func.coalesce(
+                spot_table.c.end_at, end_time)
+        else:
+            fields_to_set[spot_table.c.end_at] = end_time
+            where_conditions.append(spot_table.c.end_at.is_(None))
+        result = await session.execute(
+            sqlalchemy.update(spot_table).where(
+                sqlalchemy.and_(*where_conditions)).values(fields_to_set))
+        count = result.rowcount
+        await session.commit()
+        updated = count > 0
+    if callback_func and updated:
+        await callback_func('FAILED')
+    logger.info(failure_reason)
+
+
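Note the `override_terminal` branch above: it writes `end_at` through COALESCE, so re-failing a row that is already terminal never clobbers its recorded end time. A small self-contained illustration of that behavior (illustrative table, in-memory SQLite):

import sqlalchemy

engine = sqlalchemy.create_engine('sqlite://')
meta = sqlalchemy.MetaData()
tasks = sqlalchemy.Table(
    'tasks', meta,
    sqlalchemy.Column('end_at', sqlalchemy.Float, nullable=True))
meta.create_all(engine)

with engine.begin() as conn:
    conn.execute(tasks.insert(), [{'end_at': 100.0}, {'end_at': None}])
    # COALESCE(end_at, 200.0): a row that already has an end time keeps it;
    # only rows with NULL end_at receive the new timestamp.
    conn.execute(
        sqlalchemy.update(tasks).values(
            end_at=sqlalchemy.func.coalesce(tasks.c.end_at, 200.0)))
    print(conn.execute(sqlalchemy.select(tasks.c.end_at)).scalars().all())
    # -> [100.0, 200.0]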
+@_init_db_async
+async def set_cancelling_async(job_id: int, callback_func: AsyncCallbackType):
+    """Set tasks in the job as cancelling, if they are in non-terminal
+    states."""
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        result = await session.execute(
+            sqlalchemy.update(spot_table).where(
+                sqlalchemy.and_(
+                    spot_table.c.spot_job_id == job_id,
+                    spot_table.c.end_at.is_(None),
+                )).values(
+                    {spot_table.c.status: ManagedJobStatus.CANCELLING.value}))
+        count = result.rowcount
+        await session.commit()
+        updated = count > 0
+    if updated:
+        logger.info('Cancelling the job...')
+        await callback_func('CANCELLING')
+    else:
+        logger.info('Cancellation skipped, job is already terminal')
+
+
+@_init_db_async
+async def set_cancelled_async(job_id: int, callback_func: AsyncCallbackType):
+    """Set tasks in the job as cancelled, if they are in CANCELLING state."""
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        result = await session.execute(
+            sqlalchemy.update(spot_table).where(
+                sqlalchemy.and_(
+                    spot_table.c.spot_job_id == job_id,
+                    spot_table.c.status == ManagedJobStatus.CANCELLING.value,
+                )).values({
+                    spot_table.c.status: ManagedJobStatus.CANCELLED.value,
+                    spot_table.c.end_at: time.time(),
+                }))
+        count = result.rowcount
+        await session.commit()
+        updated = count > 0
+    if updated:
+        logger.info('Job cancelled.')
+        await callback_func('CANCELLED')
+    else:
+        logger.info('Cancellation skipped, job is not CANCELLING')
+
+
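Taken together, `set_cancelling_async` and `set_cancelled_async` form a two-phase cancellation: any live task (`end_at IS NULL`) may enter CANCELLING, but only a CANCELLING task may become CANCELLED. A tiny illustrative model of those transitions follows; the status names mirror the updates above, but the `TERMINAL` set and helper functions are purely didactic, not the module's API.

# Didactic state machine mirroring the two guarded updates above.
TERMINAL = {'SUCCEEDED', 'FAILED', 'CANCELLED'}

def set_cancelling(status: str) -> str:
    # Mirrors the end_at-IS-NULL guard: terminal tasks are left alone.
    return status if status in TERMINAL else 'CANCELLING'

def set_cancelled(status: str) -> str:
    # Mirrors the status == CANCELLING guard.
    return 'CANCELLED' if status == 'CANCELLING' else status

assert set_cancelled(set_cancelling('RUNNING')) == 'CANCELLED'
assert set_cancelled(set_cancelling('SUCCEEDED')) == 'SUCCEEDED'
assert set_cancelled('RUNNING') == 'RUNNING'  # cannot skip CANCELLING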
+@_init_db_async
+async def remove_ha_recovery_script_async(job_id: int) -> None:
+    """Remove the HA recovery script for a job."""
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        await session.execute(
+            sqlalchemy.delete(ha_recovery_script_table).where(
+                ha_recovery_script_table.c.job_id == job_id))
+        await session.commit()
+
+
+async def get_status_async(job_id: int) -> Optional[ManagedJobStatus]:
+    _, status = await get_latest_task_id_status_async(job_id)
+    return status
+
+
+@_init_db_async
+async def get_job_schedule_state_async(job_id: int) -> ManagedJobScheduleState:
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        result = await session.execute(
+            sqlalchemy.select(job_info_table.c.schedule_state).where(
+                job_info_table.c.spot_job_id == job_id))
+        state = result.fetchone()[0]
+        return ManagedJobScheduleState(state)
+
+
+@_init_db_async
+async def scheduler_set_done_async(job_id: int,
+                                   idempotent: bool = False) -> None:
+    """Do not call without holding the scheduler lock."""
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        result = await session.execute(
+            sqlalchemy.update(job_info_table).where(
+                sqlalchemy.and_(
+                    job_info_table.c.spot_job_id == job_id,
+                    job_info_table.c.schedule_state !=
+                    ManagedJobScheduleState.DONE.value,
+                )).values({
+                    job_info_table.c.schedule_state:
+                        ManagedJobScheduleState.DONE.value
+                }))
+        updated_count = result.rowcount
+        await session.commit()
+    if not idempotent:
+        assert updated_count == 1, (job_id, updated_count)
+
+
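`scheduler_set_done_async` is safe to call more than once when `idempotent=True`: the `schedule_state != DONE` guard makes every call after the first match zero rows. A minimal sketch of that property (in-memory SQLite, illustrative table):

import sqlalchemy

engine = sqlalchemy.create_engine('sqlite://')
meta = sqlalchemy.MetaData()
jobs = sqlalchemy.Table(
    'jobs', meta, sqlalchemy.Column('schedule_state', sqlalchemy.Text))
meta.create_all(engine)

def set_done(conn) -> int:
    # Guarded transition: only rows not yet DONE are updated.
    result = conn.execute(
        sqlalchemy.update(jobs).where(
            jobs.c.schedule_state != 'DONE').values(schedule_state='DONE'))
    return result.rowcount

with engine.begin() as conn:
    conn.execute(jobs.insert().values(schedule_state='ALIVE'))
    assert set_done(conn) == 1  # first call performs the transition
    assert set_done(conn) == 0  # repeat calls are no-ops, hence idempotent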
+# ==== needed for codegen ====
+# functions have no use outside of codegen, remove at your own peril
+
+
+@_init_db
+def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
+                 pool: Optional[str], pool_hash: Optional[str]):
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmt = insert_func(job_info_table).values(
+            spot_job_id=job_id,
+            name=name,
+            schedule_state=ManagedJobScheduleState.INACTIVE.value,
+            workspace=workspace,
+            entrypoint=entrypoint,
+            pool=pool,
+            pool_hash=pool_hash,
+        )
+        session.execute(insert_stmt)
+        session.commit()
+
+
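`set_job_info` dispatches to the dialect-specific insert construct rather than the generic `sqlalchemy.insert`; the `sqlite` and `postgresql` dialect modules expose extensions (e.g. ON CONFLICT clauses) that the generic construct lacks. A standalone sketch of the same dispatch, keyed on the raw dialect name instead of the module's `db_utils` enum:

import sqlalchemy
from sqlalchemy.dialects import postgresql, sqlite

def insert_func_for(engine):
    # Return the dialect-specific insert construct, as done above.
    if engine.dialect.name == 'sqlite':
        return sqlite.insert
    if engine.dialect.name == 'postgresql':
        return postgresql.insert
    raise ValueError('Unsupported database dialect')

engine = sqlalchemy.create_engine('sqlite://')
insert_func = insert_func_for(engine)
# Dialect inserts support e.g. .on_conflict_do_nothing() and
# .on_conflict_do_update(), which the generic insert lacks.
print(insert_func)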
+@_init_db
+def reset_jobs_for_recovery() -> None:
+    """Remove controller PIDs for live jobs, allowing them to be recovered."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(job_info_table).filter(
+            # PID should be set.
+            job_info_table.c.controller_pid.isnot(None),
+            # Schedule state should be alive.
+            job_info_table.c.schedule_state.isnot(None),
+            (job_info_table.c.schedule_state !=
+             ManagedJobScheduleState.INVALID.value),
+            (job_info_table.c.schedule_state !=
+             ManagedJobScheduleState.WAITING.value),
+            (job_info_table.c.schedule_state !=
+             ManagedJobScheduleState.DONE.value),
+        ).update({
+            job_info_table.c.controller_pid: None,
+            job_info_table.c.schedule_state:
+                (ManagedJobScheduleState.WAITING.value)
+        })
+        session.commit()
+
+
+@_init_db
+def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
+    """Get all job ids by name."""
+    assert _SQLALCHEMY_ENGINE is not None
+
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = sqlalchemy.select(
+            spot_table.c.spot_job_id.distinct()).select_from(
+                spot_table.outerjoin(
+                    job_info_table,
+                    spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
+        if name is not None:
+            # We match the job name from `job_info` for the jobs submitted after
+            # #1982, and from `spot` for the jobs submitted before #1982, whose
+            # job_info is not available.
+            name_condition = sqlalchemy.or_(
+                job_info_table.c.name == name,
+                sqlalchemy.and_(job_info_table.c.name.is_(None),
+                                spot_table.c.task_name == name))
+            query = query.where(name_condition)
+        query = query.order_by(spot_table.c.spot_job_id.desc())
+        rows = session.execute(query).fetchall()
+        job_ids = [row[0] for row in rows if row[0] is not None]
+        return job_ids
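The outer join plus OR condition in `get_all_job_ids_by_name` covers both row generations in one query: jobs with a `job_info` row match on `job_info.name`, while legacy rows whose `job_info.name` is NULL fall back to `spot.task_name`. A self-contained sketch of that fallback with illustrative tables and data:

import sqlalchemy

engine = sqlalchemy.create_engine('sqlite://')
meta = sqlalchemy.MetaData()
spot = sqlalchemy.Table(
    'spot', meta,
    sqlalchemy.Column('spot_job_id', sqlalchemy.Integer),
    sqlalchemy.Column('task_name', sqlalchemy.Text))
job_info = sqlalchemy.Table(
    'job_info', meta,
    sqlalchemy.Column('spot_job_id', sqlalchemy.Integer),
    sqlalchemy.Column('name', sqlalchemy.Text, nullable=True))
meta.create_all(engine)

with engine.begin() as conn:
    conn.execute(spot.insert(), [
        {'spot_job_id': 1, 'task_name': 'train'},  # legacy: no job_info row
        {'spot_job_id': 2, 'task_name': 'ignored'},
    ])
    conn.execute(job_info.insert().values(spot_job_id=2, name='train'))
    # Prefer job_info.name; fall back to spot.task_name when the outer join
    # produced no job_info row (name IS NULL).
    query = sqlalchemy.select(spot.c.spot_job_id.distinct()).select_from(
        spot.outerjoin(
            job_info,
            spot.c.spot_job_id == job_info.c.spot_job_id)).where(
                sqlalchemy.or_(
                    job_info.c.name == 'train',
                    sqlalchemy.and_(job_info.c.name.is_(None),
                                    spot.c.task_name == 'train')))
    print(sorted(row[0] for row in conn.execute(query)))  # -> [1, 2]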