skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/slurm.py +159 -72
- sky/backends/backend_utils.py +52 -10
- sky/backends/cloud_vm_ray_backend.py +192 -32
- sky/backends/task_codegen.py +40 -2
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +0 -7
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +11 -8
- sky/client/cli/command.py +106 -54
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +8 -0
- sky/client/sdk_async.py +9 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +44 -12
- sky/clouds/ssh.py +1 -1
- sky/clouds/vast.py +30 -17
- sky/core.py +69 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +29 -4
- sky/global_user_state.py +108 -16
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +9 -0
- sky/optimizer.py +2 -1
- sky/provision/__init__.py +11 -9
- sky/provision/kubernetes/utils.py +122 -15
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/instance.py +75 -29
- sky/provision/slurm/utils.py +213 -107
- sky/provision/vast/utils.py +1 -0
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +16 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +28 -10
- sky/server/requests/serializers/encoders.py +5 -0
- sky/server/requests/serializers/return_value_serializers.py +14 -4
- sky/server/server.py +434 -107
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +21 -10
- sky/sky_logging.py +2 -1
- sky/skylet/constants.py +22 -5
- sky/skylet/executor/slurm.py +4 -6
- sky/skylet/job_lib.py +89 -4
- sky/skylet/services.py +18 -3
- sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -6
- sky/templates/slurm-ray.yml.j2 +32 -2
- sky/templates/websocket_proxy.py +18 -41
- sky/users/permission.py +61 -51
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +256 -94
- sky/utils/command_runner.pyi +16 -0
- sky/utils/common_utils.py +30 -29
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +63 -20
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
- /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
# that we can easily switch to a s3-based storage.
|
|
4
4
|
import asyncio
|
|
5
5
|
import collections
|
|
6
|
+
import datetime
|
|
6
7
|
import enum
|
|
7
8
|
import functools
|
|
8
9
|
import ipaddress
|
|
@@ -11,7 +12,8 @@ import sqlite3
|
|
|
11
12
|
import threading
|
|
12
13
|
import time
|
|
13
14
|
import typing
|
|
14
|
-
from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple,
|
|
15
|
+
from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
|
|
16
|
+
Union)
|
|
15
17
|
import urllib.parse
|
|
16
18
|
|
|
17
19
|
import colorama
|
|
@@ -24,6 +26,7 @@ from sqlalchemy.ext import asyncio as sql_async
|
|
|
24
26
|
from sqlalchemy.ext import declarative
|
|
25
27
|
|
|
26
28
|
from sky import exceptions
|
|
29
|
+
from sky import resources as resources_lib
|
|
27
30
|
from sky import sky_logging
|
|
28
31
|
from sky import skypilot_config
|
|
29
32
|
from sky.adaptors import common as adaptors_common
|
|
@@ -32,6 +35,7 @@ from sky.utils import common_utils
|
|
|
32
35
|
from sky.utils import context_utils
|
|
33
36
|
from sky.utils.db import db_utils
|
|
34
37
|
from sky.utils.db import migration_utils
|
|
38
|
+
from sky.utils.plugin_extensions import ExternalClusterFailure
|
|
35
39
|
|
|
36
40
|
if typing.TYPE_CHECKING:
|
|
37
41
|
from sqlalchemy.engine import row
|
|
@@ -54,6 +58,11 @@ _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
|
|
|
54
58
|
|
|
55
59
|
_DB_RETRY_TIMES = 30
|
|
56
60
|
|
|
61
|
+
# 30 days retention for job events
|
|
62
|
+
DEFAULT_JOB_EVENT_RETENTION_HOURS = 30 * 24.0
|
|
63
|
+
# Run the job event retention daemon every hour
|
|
64
|
+
JOB_EVENT_DAEMON_INTERVAL_SECONDS = 3600
|
|
65
|
+
|
|
57
66
|
Base = declarative.declarative_base()
|
|
58
67
|
|
|
59
68
|
# === Database schema ===
|
|
@@ -94,7 +103,9 @@ spot_table = sqlalchemy.Table(
|
|
|
94
103
|
sqlalchemy.Column('specs', sqlalchemy.Text),
|
|
95
104
|
sqlalchemy.Column('local_log_file', sqlalchemy.Text, server_default=None),
|
|
96
105
|
sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
|
|
106
|
+
sqlalchemy.Column('links', sqlalchemy.JSON, server_default=None),
|
|
97
107
|
sqlalchemy.Column('logs_cleaned_at', sqlalchemy.Float, server_default=None),
|
|
108
|
+
sqlalchemy.Column('full_resources', sqlalchemy.JSON, server_default=None),
|
|
98
109
|
)
|
|
99
110
|
|
|
100
111
|
job_info_table = sqlalchemy.Table(
|
|
@@ -151,6 +162,25 @@ ha_recovery_script_table = sqlalchemy.Table(
|
|
|
151
162
|
sqlalchemy.Column('script', sqlalchemy.Text),
|
|
152
163
|
)
|
|
153
164
|
|
|
165
|
+
job_events_table = sqlalchemy.Table(
|
|
166
|
+
'job_events',
|
|
167
|
+
Base.metadata,
|
|
168
|
+
sqlalchemy.Column('id',
|
|
169
|
+
sqlalchemy.Integer,
|
|
170
|
+
primary_key=True,
|
|
171
|
+
autoincrement=True),
|
|
172
|
+
# See comment above for explanation of the legacy spot_job_id and
|
|
173
|
+
# task_id columns.
|
|
174
|
+
sqlalchemy.Column('spot_job_id', sqlalchemy.Integer, index=True),
|
|
175
|
+
sqlalchemy.Column('task_id', sqlalchemy.Integer, index=True),
|
|
176
|
+
sqlalchemy.Column('new_status', sqlalchemy.Text),
|
|
177
|
+
sqlalchemy.Column('code', sqlalchemy.Text),
|
|
178
|
+
sqlalchemy.Column('reason', sqlalchemy.Text),
|
|
179
|
+
sqlalchemy.Column('timestamp',
|
|
180
|
+
sqlalchemy.DateTime(timezone=True),
|
|
181
|
+
index=True),
|
|
182
|
+
)
|
|
183
|
+
|
|
154
184
|
|
|
155
185
|
def create_table(engine: sqlalchemy.engine.Engine):
|
|
156
186
|
# Enable WAL mode to avoid locking issues.
|
|
@@ -352,6 +382,7 @@ def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
|
|
|
352
382
|
'specs': r.get('specs'),
|
|
353
383
|
'local_log_file': r.get('local_log_file'),
|
|
354
384
|
'metadata': r.get('metadata'),
|
|
385
|
+
'links': r.get('links'), # SQLAlchemy JSON type, already parsed
|
|
355
386
|
# columns from job_info table (some may be None for legacy jobs)
|
|
356
387
|
'_job_info_job_id': r.get(job_info_table.c.spot_job_id
|
|
357
388
|
), # ambiguous, use table.column
|
|
@@ -767,8 +798,10 @@ def set_pending(
|
|
|
767
798
|
metadata: str,
|
|
768
799
|
):
|
|
769
800
|
"""Set the task to pending state."""
|
|
770
|
-
|
|
801
|
+
add_job_event(job_id, task_id, ManagedJobStatus.PENDING,
|
|
802
|
+
'Job submitted to queue')
|
|
771
803
|
|
|
804
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
772
805
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
773
806
|
session.execute(
|
|
774
807
|
sqlalchemy.insert(spot_table).values(
|
|
@@ -789,6 +822,9 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
|
|
|
789
822
|
This should only be used to transition from STARTING or RECOVERING back to
|
|
790
823
|
PENDING.
|
|
791
824
|
"""
|
|
825
|
+
await add_job_event_async(job_id, task_id, ManagedJobStatus.PENDING,
|
|
826
|
+
'Job is in backoff')
|
|
827
|
+
|
|
792
828
|
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
793
829
|
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
794
830
|
result = await session.execute(
|
|
@@ -824,10 +860,13 @@ async def set_restarting_async(job_id: int, task_id: int, recovering: bool):
|
|
|
824
860
|
after using set_backoff_pending to transition back to PENDING during
|
|
825
861
|
launch retry backoff.
|
|
826
862
|
"""
|
|
827
|
-
|
|
828
|
-
target_status = ManagedJobStatus.STARTING.value
|
|
863
|
+
target_status = ManagedJobStatus.STARTING
|
|
829
864
|
if recovering:
|
|
830
|
-
target_status = ManagedJobStatus.RECOVERING
|
|
865
|
+
target_status = ManagedJobStatus.RECOVERING
|
|
866
|
+
|
|
867
|
+
await add_job_event_async(job_id, task_id, target_status,
|
|
868
|
+
'Job is restarting')
|
|
869
|
+
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
831
870
|
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
832
871
|
result = await session.execute(
|
|
833
872
|
sqlalchemy.update(spot_table).where(
|
|
@@ -835,7 +874,7 @@ async def set_restarting_async(job_id: int, task_id: int, recovering: bool):
|
|
|
835
874
|
spot_table.c.spot_job_id == job_id,
|
|
836
875
|
spot_table.c.task_id == task_id,
|
|
837
876
|
spot_table.c.end_at.is_(None),
|
|
838
|
-
)).values({spot_table.c.status: target_status}))
|
|
877
|
+
)).values({spot_table.c.status: target_status.value}))
|
|
839
878
|
count = result.rowcount
|
|
840
879
|
await session.commit()
|
|
841
880
|
logger.debug(f'back to {target_status}')
|
|
@@ -936,6 +975,8 @@ def set_pending_cancelled(job_id: int):
|
|
|
936
975
|
Returns:
|
|
937
976
|
True if the job was cancelled, False otherwise.
|
|
938
977
|
"""
|
|
978
|
+
add_job_event(job_id, None, ManagedJobStatus.CANCELLED,
|
|
979
|
+
'Job has been cancelled')
|
|
939
980
|
assert _SQLALCHEMY_ENGINE is not None
|
|
940
981
|
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
941
982
|
# Subquery to get the spot_job_ids that match the joined condition
|
|
@@ -1681,6 +1722,29 @@ def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
|
|
|
1681
1722
|
session.commit()
|
|
1682
1723
|
|
|
1683
1724
|
|
|
1725
|
+
@_init_db
|
|
1726
|
+
def update_job_full_resources(job_id: int,
|
|
1727
|
+
full_resources_json: Dict[str, Any]) -> None:
|
|
1728
|
+
"""Update the full_resources column for a job.
|
|
1729
|
+
|
|
1730
|
+
This is called after scheduling to set the specific resource that was
|
|
1731
|
+
selected from an any_of or ordered list. The update happens within the
|
|
1732
|
+
filelock in get_next_cluster_name to ensure atomicity.
|
|
1733
|
+
|
|
1734
|
+
Args:
|
|
1735
|
+
job_id: The spot_job_id to update
|
|
1736
|
+
full_resources_json: The resolved resource configuration (single
|
|
1737
|
+
resource, not any_of/ordered)
|
|
1738
|
+
"""
|
|
1739
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1740
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1741
|
+
session.execute(
|
|
1742
|
+
sqlalchemy.update(spot_table).where(
|
|
1743
|
+
spot_table.c.spot_job_id == job_id).values(
|
|
1744
|
+
{spot_table.c.full_resources: full_resources_json}))
|
|
1745
|
+
session.commit()
|
|
1746
|
+
|
|
1747
|
+
|
|
1684
1748
|
@_init_db_async
|
|
1685
1749
|
async def set_job_id_on_pool_cluster_async(job_id: int,
|
|
1686
1750
|
job_id_on_pool_cluster: int) -> None:
|
|
@@ -1857,6 +1921,83 @@ def get_nonterminal_job_ids_by_pool(pool: str,
|
|
|
1857
1921
|
return job_ids
|
|
1858
1922
|
|
|
1859
1923
|
|
|
1924
|
+
def _is_any_of_or_ordered(resource_config: Dict[str, Any]) -> bool:
|
|
1925
|
+
"""Check if resource config is heterogeneous (any_of or ordered).
|
|
1926
|
+
|
|
1927
|
+
Args:
|
|
1928
|
+
resource_config: Resource configuration dictionary
|
|
1929
|
+
|
|
1930
|
+
Returns:
|
|
1931
|
+
True if the config contains 'any_of' or 'ordered' keys, indicating
|
|
1932
|
+
heterogeneous resources that haven't been resolved to a specific
|
|
1933
|
+
resource yet.
|
|
1934
|
+
"""
|
|
1935
|
+
return 'any_of' in resource_config or 'ordered' in resource_config
|
|
1936
|
+
|
|
1937
|
+
|
|
1938
|
+
@_init_db
|
|
1939
|
+
def get_pool_worker_used_resources(
|
|
1940
|
+
job_ids: Set[int]) -> Optional['resources_lib.Resources']:
|
|
1941
|
+
"""Get the total used resources by running jobs.
|
|
1942
|
+
|
|
1943
|
+
Args:
|
|
1944
|
+
job_ids: Set of spot_job_id values to check
|
|
1945
|
+
|
|
1946
|
+
Returns:
|
|
1947
|
+
Resources object with summed resources from all running jobs, or None
|
|
1948
|
+
if we couldn't parse the resources string for any job.
|
|
1949
|
+
"""
|
|
1950
|
+
if not job_ids:
|
|
1951
|
+
return None
|
|
1952
|
+
|
|
1953
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1954
|
+
|
|
1955
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1956
|
+
# Query spot_table for full_resources. Use full_resources if available,
|
|
1957
|
+
# otherwise fall back to resources for backward compatibility.
|
|
1958
|
+
# Don't check for running status because we want to include jobs that
|
|
1959
|
+
# may have just been scheduled. The job_ids come from
|
|
1960
|
+
# get_nonterminal_job_ids_by_pool anyway so we don't need to worry
|
|
1961
|
+
# about removing old jobs.
|
|
1962
|
+
query = sqlalchemy.select(spot_table.c.full_resources).where(
|
|
1963
|
+
sqlalchemy.and_(spot_table.c.spot_job_id.in_(job_ids)))
|
|
1964
|
+
rows = session.execute(query).fetchall()
|
|
1965
|
+
|
|
1966
|
+
resource_configs = []
|
|
1967
|
+
for row in rows:
|
|
1968
|
+
if row[0] is None:
|
|
1969
|
+
# We don't have full_resources for this job. We should return
|
|
1970
|
+
# none since we can't make any guarantees about what resources
|
|
1971
|
+
# are being used.
|
|
1972
|
+
return None
|
|
1973
|
+
resource_configs.append(row[0])
|
|
1974
|
+
|
|
1975
|
+
# Parse resources dicts into Resources objects and sum them using +
|
|
1976
|
+
total_resources = None
|
|
1977
|
+
# full_resources is now stored as JSON dict from to_yaml_config()
|
|
1978
|
+
for resource_config in resource_configs:
|
|
1979
|
+
# Check if this is an unresolved heterogeneous config (any_of/ordered)
|
|
1980
|
+
if _is_any_of_or_ordered(resource_config):
|
|
1981
|
+
# Can't determine usage for heterogeneous unresolved configs.
|
|
1982
|
+
# Return None to fall back to non-resource-aware scheduling.
|
|
1983
|
+
return None
|
|
1984
|
+
|
|
1985
|
+
resources_set = resources_lib.Resources.from_yaml_config(
|
|
1986
|
+
resource_config)
|
|
1987
|
+
if len(resources_set) == 0:
|
|
1988
|
+
# We couldn't parse the resources JSON. We should return
|
|
1989
|
+
# none since we can't make any guarantees about what resources
|
|
1990
|
+
# are being used.
|
|
1991
|
+
return None
|
|
1992
|
+
# Get the first Resources object from the set/list
|
|
1993
|
+
parsed = next(iter(resources_set))
|
|
1994
|
+
if total_resources is None:
|
|
1995
|
+
total_resources = parsed
|
|
1996
|
+
else:
|
|
1997
|
+
total_resources = total_resources + parsed
|
|
1998
|
+
return total_resources
|
|
1999
|
+
|
|
2000
|
+
|
|
1860
2001
|
@_init_db_async
|
|
1861
2002
|
async def get_waiting_job_async(
|
|
1862
2003
|
pid: int, pid_started_at: float) -> Optional[Dict[str, Any]]:
|
|
@@ -1964,14 +2105,30 @@ async def get_latest_task_id_status_async(
|
|
|
1964
2105
|
|
|
1965
2106
|
|
|
1966
2107
|
@_init_db_async
|
|
1967
|
-
async def set_starting_async(job_id: int,
|
|
1968
|
-
|
|
2108
|
+
async def set_starting_async(job_id: int,
|
|
2109
|
+
task_id: int,
|
|
2110
|
+
run_timestamp: str,
|
|
2111
|
+
submit_time: float,
|
|
2112
|
+
resources_str: str,
|
|
1969
2113
|
specs: Dict[str, Union[str, int]],
|
|
1970
|
-
callback_func: AsyncCallbackType
|
|
2114
|
+
callback_func: AsyncCallbackType,
|
|
2115
|
+
full_resources_json: Optional[Dict[str,
|
|
2116
|
+
Any]] = None):
|
|
1971
2117
|
"""Set the task to starting state."""
|
|
2118
|
+
await add_job_event_async(job_id, task_id, ManagedJobStatus.STARTING,
|
|
2119
|
+
'Job is starting')
|
|
1972
2120
|
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
1973
2121
|
logger.info('Launching the spot cluster...')
|
|
1974
2122
|
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
2123
|
+
values = {
|
|
2124
|
+
spot_table.c.resources: resources_str,
|
|
2125
|
+
spot_table.c.submitted_at: submit_time,
|
|
2126
|
+
spot_table.c.status: ManagedJobStatus.STARTING.value,
|
|
2127
|
+
spot_table.c.run_timestamp: run_timestamp,
|
|
2128
|
+
spot_table.c.specs: json.dumps(specs),
|
|
2129
|
+
}
|
|
2130
|
+
if full_resources_json is not None:
|
|
2131
|
+
values[spot_table.c.full_resources] = full_resources_json
|
|
1975
2132
|
result = await session.execute(
|
|
1976
2133
|
sqlalchemy.update(spot_table).where(
|
|
1977
2134
|
sqlalchemy.and_(
|
|
@@ -1979,13 +2136,7 @@ async def set_starting_async(job_id: int, task_id: int, run_timestamp: str,
|
|
|
1979
2136
|
spot_table.c.task_id == task_id,
|
|
1980
2137
|
spot_table.c.status == ManagedJobStatus.PENDING.value,
|
|
1981
2138
|
spot_table.c.end_at.is_(None),
|
|
1982
|
-
)).values(
|
|
1983
|
-
spot_table.c.resources: resources_str,
|
|
1984
|
-
spot_table.c.submitted_at: submit_time,
|
|
1985
|
-
spot_table.c.status: ManagedJobStatus.STARTING.value,
|
|
1986
|
-
spot_table.c.run_timestamp: run_timestamp,
|
|
1987
|
-
spot_table.c.specs: json.dumps(specs),
|
|
1988
|
-
}))
|
|
2139
|
+
)).values(values))
|
|
1989
2140
|
count = result.rowcount
|
|
1990
2141
|
await session.commit()
|
|
1991
2142
|
if count != 1:
|
|
@@ -2003,6 +2154,8 @@ async def set_starting_async(job_id: int, task_id: int, run_timestamp: str,
|
|
|
2003
2154
|
async def set_started_async(job_id: int, task_id: int, start_time: float,
|
|
2004
2155
|
callback_func: AsyncCallbackType):
|
|
2005
2156
|
"""Set the task to started state."""
|
|
2157
|
+
await add_job_event_async(job_id, task_id, ManagedJobStatus.RUNNING,
|
|
2158
|
+
'Job has started')
|
|
2006
2159
|
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
2007
2160
|
logger.info('Job started.')
|
|
2008
2161
|
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
@@ -2047,10 +2200,23 @@ async def get_job_status_with_task_id_async(
|
|
|
2047
2200
|
|
|
2048
2201
|
|
|
2049
2202
|
@_init_db_async
|
|
2050
|
-
async def set_recovering_async(
|
|
2051
|
-
|
|
2052
|
-
|
|
2203
|
+
async def set_recovering_async(
|
|
2204
|
+
job_id: int,
|
|
2205
|
+
task_id: int,
|
|
2206
|
+
force_transit_to_recovering: bool,
|
|
2207
|
+
callback_func: AsyncCallbackType,
|
|
2208
|
+
external_failures: Optional[List[ExternalClusterFailure]] = None,
|
|
2209
|
+
):
|
|
2053
2210
|
"""Set the task to recovering state, and update the job duration."""
|
|
2211
|
+
# Build code and reason from external failures for the event log
|
|
2212
|
+
if external_failures:
|
|
2213
|
+
code = '; '.join(f.code for f in external_failures)
|
|
2214
|
+
reason = '; '.join(f.reason for f in external_failures)
|
|
2215
|
+
else:
|
|
2216
|
+
code = None
|
|
2217
|
+
reason = 'Cluster preempted or failed, recovering'
|
|
2218
|
+
await add_job_event_async(job_id, task_id, ManagedJobStatus.RECOVERING,
|
|
2219
|
+
reason, code)
|
|
2054
2220
|
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
2055
2221
|
logger.info('=== Recovering... ===')
|
|
2056
2222
|
current_time = time.time()
|
|
@@ -2099,6 +2265,8 @@ async def set_recovering_async(job_id: int, task_id: int,
|
|
|
2099
2265
|
async def set_recovered_async(job_id: int, task_id: int, recovered_time: float,
|
|
2100
2266
|
callback_func: AsyncCallbackType):
|
|
2101
2267
|
"""Set the task to recovered."""
|
|
2268
|
+
await add_job_event_async(job_id, task_id, ManagedJobStatus.RUNNING,
|
|
2269
|
+
'Job has recovered')
|
|
2102
2270
|
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
2103
2271
|
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
2104
2272
|
result = await session.execute(
|
|
@@ -2131,6 +2299,8 @@ async def set_recovered_async(job_id: int, task_id: int, recovered_time: float,
|
|
|
2131
2299
|
async def set_succeeded_async(job_id: int, task_id: int, end_time: float,
|
|
2132
2300
|
callback_func: AsyncCallbackType):
|
|
2133
2301
|
"""Set the task to succeeded, if it is in a non-terminal state."""
|
|
2302
|
+
await add_job_event_async(job_id, task_id, ManagedJobStatus.SUCCEEDED,
|
|
2303
|
+
'Job has succeeded')
|
|
2134
2304
|
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
2135
2305
|
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
2136
2306
|
result = await session.execute(
|
|
@@ -2168,6 +2338,8 @@ async def set_failed_async(
|
|
|
2168
2338
|
override_terminal: bool = False,
|
|
2169
2339
|
):
|
|
2170
2340
|
"""Set an entire job or task to failed."""
|
|
2341
|
+
await add_job_event_async(job_id, task_id, failure_type,
|
|
2342
|
+
f'Job failed: {failure_reason}')
|
|
2171
2343
|
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
2172
2344
|
assert failure_type.is_failed(), failure_type
|
|
2173
2345
|
end_time = time.time() if end_time is None else end_time
|
|
@@ -2217,10 +2389,59 @@ async def set_failed_async(
|
|
|
2217
2389
|
logger.info(failure_reason)
|
|
2218
2390
|
|
|
2219
2391
|
|
|
2392
|
+
@_init_db_async
|
|
2393
|
+
async def update_links_async(job_id: int, task_id: int,
|
|
2394
|
+
links: Dict[str, str]) -> None:
|
|
2395
|
+
"""Update the links for a managed job task.
|
|
2396
|
+
|
|
2397
|
+
Links are stored as JSON in the database. SQLAlchemy handles
|
|
2398
|
+
serialization/deserialization automatically.
|
|
2399
|
+
|
|
2400
|
+
Uses a transaction to ensure atomicity. For PostgreSQL, we use row-level
|
|
2401
|
+
locking (SELECT FOR UPDATE). For SQLite, row-level locking is not
|
|
2402
|
+
supported, so we rely on SQLite's database-level write locking which
|
|
2403
|
+
provides serializable isolation for write transactions.
|
|
2404
|
+
"""
|
|
2405
|
+
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
2406
|
+
logger.info(f'Updating external links with: {links}')
|
|
2407
|
+
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
2408
|
+
async with session.begin():
|
|
2409
|
+
# Build the select query
|
|
2410
|
+
select_query = sqlalchemy.select(spot_table.c.links).where(
|
|
2411
|
+
sqlalchemy.and_(spot_table.c.spot_job_id == job_id,
|
|
2412
|
+
spot_table.c.task_id == task_id))
|
|
2413
|
+
|
|
2414
|
+
# Use row-level locking for PostgreSQL; SQLite doesn't support
|
|
2415
|
+
# SELECT FOR UPDATE but provides database-level write locking
|
|
2416
|
+
if (_SQLALCHEMY_ENGINE_ASYNC.dialect.name ==
|
|
2417
|
+
db_utils.SQLAlchemyDialect.POSTGRESQL.value):
|
|
2418
|
+
select_query = select_query.with_for_update()
|
|
2419
|
+
|
|
2420
|
+
result = await session.execute(select_query)
|
|
2421
|
+
existing_links_row = result.fetchone()
|
|
2422
|
+
existing_links = {}
|
|
2423
|
+
if existing_links_row and existing_links_row[0]:
|
|
2424
|
+
existing_links = existing_links_row[0]
|
|
2425
|
+
|
|
2426
|
+
# Merge new links into existing
|
|
2427
|
+
existing_links.update(links)
|
|
2428
|
+
|
|
2429
|
+
# Update the database (SQLAlchemy JSON type handles serialization)
|
|
2430
|
+
await session.execute(
|
|
2431
|
+
sqlalchemy.update(spot_table).where(
|
|
2432
|
+
sqlalchemy.and_(spot_table.c.spot_job_id == job_id,
|
|
2433
|
+
spot_table.c.task_id == task_id)).values({
|
|
2434
|
+
spot_table.c.links: existing_links,
|
|
2435
|
+
}))
|
|
2436
|
+
# Transaction commits automatically when exiting the context
|
|
2437
|
+
|
|
2438
|
+
|
|
2220
2439
|
@_init_db_async
|
|
2221
2440
|
async def set_cancelling_async(job_id: int, callback_func: AsyncCallbackType):
|
|
2222
2441
|
"""Set tasks in the job as cancelling, if they are in non-terminal
|
|
2223
2442
|
states."""
|
|
2443
|
+
await add_job_event_async(job_id, None, ManagedJobStatus.CANCELLING,
|
|
2444
|
+
'Job is cancelling')
|
|
2224
2445
|
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
2225
2446
|
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
2226
2447
|
result = await session.execute(
|
|
@@ -2243,6 +2464,8 @@ async def set_cancelling_async(job_id: int, callback_func: AsyncCallbackType):
|
|
|
2243
2464
|
@_init_db_async
|
|
2244
2465
|
async def set_cancelled_async(job_id: int, callback_func: AsyncCallbackType):
|
|
2245
2466
|
"""Set tasks in the job as cancelled, if they are in CANCELLING state."""
|
|
2467
|
+
await add_job_event_async(job_id, None, ManagedJobStatus.CANCELLED,
|
|
2468
|
+
'Job has been cancelled')
|
|
2246
2469
|
assert _SQLALCHEMY_ENGINE_ASYNC is not None
|
|
2247
2470
|
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
|
|
2248
2471
|
result = await session.execute(
|
|
@@ -2519,3 +2742,178 @@ def set_controller_logs_cleaned(job_ids: List[int], logs_cleaned_at: float):
|
|
|
2519
2742
|
job_info_table.c.spot_job_id.in_(job_ids)).values(
|
|
2520
2743
|
controller_logs_cleaned_at=logs_cleaned_at))
|
|
2521
2744
|
session.commit()
|
|
2745
|
+
|
|
2746
|
+
|
|
2747
|
+
@_init_db
def add_job_event(job_id: int,
                  task_id: Optional[int],
                  new_status: ManagedJobStatus,
                  reason: str,
                  code: Optional[str] = None,
                  timestamp: Optional[datetime.datetime] = None) -> None:
    """Add a job event record to the audit log.

    Args:
        job_id: The spot_job_id of the managed job.
        task_id: The task_id within the managed job. If None, adds a
            job-level event that applies to all tasks.
        new_status: The new status being transitioned to. Can be a
            ManagedJobStatus enum.
        reason: A description of why the event occurred.
        code: Optional error category code for failures. Added for parity
            with add_job_event_async (the table has a `code` column that
            get_job_events reads); defaults to None so existing callers
            are unaffected.
        timestamp: The timestamp of the event. If None, uses current time.
    """
    if timestamp is None:
        timestamp = datetime.datetime.now()

    status_value = new_status.value

    assert _SQLALCHEMY_ENGINE is not None
    with orm.Session(_SQLALCHEMY_ENGINE) as session:
        session.execute(job_events_table.insert().values(
            spot_job_id=job_id,
            task_id=task_id,  # Can be None for job-level events
            new_status=status_value,
            code=code,
            reason=reason,
            timestamp=timestamp,
        ))
        session.commit()
|
|
2779
|
+
|
|
2780
|
+
|
|
2781
|
+
async def _get_all_task_ids_async(job_id: int) -> List[int]:
    """Return the IDs of all tasks in a job, in ascending order (async)."""
    assert _SQLALCHEMY_ENGINE_ASYNC is not None
    # Build the query up front; only the execution needs the session.
    query = sqlalchemy.select(spot_table.c.task_id).where(
        spot_table.c.spot_job_id == job_id).order_by(
            spot_table.c.task_id.asc())
    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
        rows = (await session.execute(query)).fetchall()
    return [task_id for (task_id,) in rows]
|
|
2790
|
+
|
|
2791
|
+
|
|
2792
|
+
@_init_db_async
async def add_job_event_async(
        job_id: int,
        task_id: Optional[int],
        new_status: ManagedJobStatus,
        reason: str,
        code: Optional[str] = None,
        timestamp: Optional[datetime.datetime] = None) -> None:
    """Record a job event in the audit log (async version).

    Args:
        job_id: The spot_job_id of the managed job.
        task_id: The task_id within the managed job. If None, adds a
            job-level event that applies to all tasks.
        new_status: The new status being transitioned to. Can be a
            ManagedJobStatus enum.
        reason: A description of why the event occurred.
        code: Optional error category code for failures.
        timestamp: The timestamp of the event. If None, uses current time.
    """
    event_time = datetime.datetime.now() if timestamp is None else timestamp

    assert _SQLALCHEMY_ENGINE_ASYNC is not None
    # Build the insert statement outside the session scope.
    insert_stmt = job_events_table.insert().values(
        spot_job_id=job_id,
        task_id=task_id,  # Can be None for job-level events
        new_status=new_status.value,
        code=code,
        reason=reason,
        timestamp=event_time,
    )
    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
        await session.execute(insert_stmt)
        await session.commit()
|
|
2828
|
+
|
|
2829
|
+
|
|
2830
|
+
@_init_db
def get_job_events(job_id: int,
                   task_id: Optional[int] = None,
                   limit: Optional[int] = None) -> List[Dict[str, Any]]:
    """Get task events for a managed job.

    Args:
        job_id: The spot_job_id of the managed job.
        task_id: Optional task_id to filter by. If None, returns events
            for all tasks. If specified, returns events for that task plus
            job-level events (where task_id is None).
        limit: Optional limit on number of events to return. If specified,
            returns the most recent N events.

    Returns:
        List of event records, always ordered by timestamp descending
        (most recent first), regardless of whether limit is specified.
    """
    assert _SQLALCHEMY_ENGINE is not None
    with orm.Session(_SQLALCHEMY_ENGINE) as session:
        query = sqlalchemy.select(
            job_events_table.c.spot_job_id,
            job_events_table.c.task_id,
            job_events_table.c.new_status,
            job_events_table.c.code,
            job_events_table.c.reason,
            job_events_table.c.timestamp,
        ).where(job_events_table.c.spot_job_id == job_id)

        if task_id is not None:
            # Include events for the specific task AND job-level events
            # (task_id is None)
            query = query.where(
                sqlalchemy.or_(job_events_table.c.task_id == task_id,
                               job_events_table.c.task_id.is_(None)))

        # Order by timestamp descending to get most recent first
        query = query.order_by(job_events_table.c.timestamp.desc())

        if limit is not None:
            query = query.limit(limit)

        rows = session.execute(query).fetchall()
        return [{
            'spot_job_id': row[0],
            'task_id': row[1],
            'new_status': ManagedJobStatus(row[2]),
            'code': row[3],
            'reason': row[4],
            'timestamp': row[5],
        } for row in rows]
|
|
2881
|
+
|
|
2882
|
+
|
|
2883
|
+
@_init_db_async
async def cleanup_job_events_with_retention_async(
        retention_hours: float) -> None:
    """Delete job events older than the retention period.

    Args:
        retention_hours: Number of hours to retain job events.
    """
    assert _SQLALCHEMY_ENGINE_ASYNC is not None
    # Anything with a timestamp before this cutoff is eligible for deletion.
    oldest_allowed = datetime.datetime.now() - datetime.timedelta(
        hours=retention_hours)
    delete_stmt = sqlalchemy.delete(job_events_table).where(
        job_events_table.c.timestamp < oldest_allowed)

    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
        outcome = await session.execute(delete_stmt)
        deleted = outcome.rowcount
        if deleted > 0:
            logger.debug(f'Deleted {deleted} job events older than '
                         f'{retention_hours} hours.')
        await session.commit()
|
|
2904
|
+
|
|
2905
|
+
|
|
2906
|
+
async def job_event_retention_daemon():
    """Periodically garbage-collect old job events until cancelled."""
    while True:
        logger.info('Running job event retention daemon...')
        try:
            await cleanup_job_events_with_retention_async(
                DEFAULT_JOB_EVENT_RETENTION_HOURS)
        except asyncio.CancelledError:
            # Graceful shutdown: exit the loop when the task is cancelled.
            logger.info('Job event retention daemon cancelled')
            break
        except Exception as exc:  # pylint: disable=broad-except
            # Best-effort cleanup: log the failure and retry next interval.
            logger.error(f'Error running job event retention daemon: {exc}')

        await asyncio.sleep(JOB_EVENT_DAEMON_INTERVAL_SECONDS)
|