skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -3,6 +3,7 @@
3
3
  # that we can easily switch to a s3-based storage.
4
4
  import asyncio
5
5
  import collections
6
+ import datetime
6
7
  import enum
7
8
  import functools
8
9
  import ipaddress
@@ -11,7 +12,8 @@ import sqlite3
11
12
  import threading
12
13
  import time
13
14
  import typing
14
- from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, Union
15
+ from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
16
+ Union)
15
17
  import urllib.parse
16
18
 
17
19
  import colorama
@@ -24,6 +26,7 @@ from sqlalchemy.ext import asyncio as sql_async
24
26
  from sqlalchemy.ext import declarative
25
27
 
26
28
  from sky import exceptions
29
+ from sky import resources as resources_lib
27
30
  from sky import sky_logging
28
31
  from sky import skypilot_config
29
32
  from sky.adaptors import common as adaptors_common
@@ -32,6 +35,7 @@ from sky.utils import common_utils
32
35
  from sky.utils import context_utils
33
36
  from sky.utils.db import db_utils
34
37
  from sky.utils.db import migration_utils
38
+ from sky.utils.plugin_extensions import ExternalClusterFailure
35
39
 
36
40
  if typing.TYPE_CHECKING:
37
41
  from sqlalchemy.engine import row
@@ -54,6 +58,11 @@ _SQLALCHEMY_ENGINE_LOCK = threading.Lock()
54
58
 
55
59
  _DB_RETRY_TIMES = 30
56
60
 
61
+ # 30 days retention for job events
62
+ DEFAULT_JOB_EVENT_RETENTION_HOURS = 30 * 24.0
63
+ # Run the job event retention daemon every hour
64
+ JOB_EVENT_DAEMON_INTERVAL_SECONDS = 3600
65
+
57
66
  Base = declarative.declarative_base()
58
67
 
59
68
  # === Database schema ===
@@ -94,7 +103,9 @@ spot_table = sqlalchemy.Table(
94
103
  sqlalchemy.Column('specs', sqlalchemy.Text),
95
104
  sqlalchemy.Column('local_log_file', sqlalchemy.Text, server_default=None),
96
105
  sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
106
+ sqlalchemy.Column('links', sqlalchemy.JSON, server_default=None),
97
107
  sqlalchemy.Column('logs_cleaned_at', sqlalchemy.Float, server_default=None),
108
+ sqlalchemy.Column('full_resources', sqlalchemy.JSON, server_default=None),
98
109
  )
99
110
 
100
111
  job_info_table = sqlalchemy.Table(
@@ -151,6 +162,25 @@ ha_recovery_script_table = sqlalchemy.Table(
151
162
  sqlalchemy.Column('script', sqlalchemy.Text),
152
163
  )
153
164
 
165
+ job_events_table = sqlalchemy.Table(
166
+ 'job_events',
167
+ Base.metadata,
168
+ sqlalchemy.Column('id',
169
+ sqlalchemy.Integer,
170
+ primary_key=True,
171
+ autoincrement=True),
172
+ # See comment above for explanation of the legacy spot_job_id and
173
+ # task_id columns.
174
+ sqlalchemy.Column('spot_job_id', sqlalchemy.Integer, index=True),
175
+ sqlalchemy.Column('task_id', sqlalchemy.Integer, index=True),
176
+ sqlalchemy.Column('new_status', sqlalchemy.Text),
177
+ sqlalchemy.Column('code', sqlalchemy.Text),
178
+ sqlalchemy.Column('reason', sqlalchemy.Text),
179
+ sqlalchemy.Column('timestamp',
180
+ sqlalchemy.DateTime(timezone=True),
181
+ index=True),
182
+ )
183
+
154
184
 
155
185
  def create_table(engine: sqlalchemy.engine.Engine):
156
186
  # Enable WAL mode to avoid locking issues.
@@ -352,6 +382,7 @@ def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
352
382
  'specs': r.get('specs'),
353
383
  'local_log_file': r.get('local_log_file'),
354
384
  'metadata': r.get('metadata'),
385
+ 'links': r.get('links'), # SQLAlchemy JSON type, already parsed
355
386
  # columns from job_info table (some may be None for legacy jobs)
356
387
  '_job_info_job_id': r.get(job_info_table.c.spot_job_id
357
388
  ), # ambiguous, use table.column
@@ -767,8 +798,10 @@ def set_pending(
767
798
  metadata: str,
768
799
  ):
769
800
  """Set the task to pending state."""
770
- assert _SQLALCHEMY_ENGINE is not None
801
+ add_job_event(job_id, task_id, ManagedJobStatus.PENDING,
802
+ 'Job submitted to queue')
771
803
 
804
+ assert _SQLALCHEMY_ENGINE is not None
772
805
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
773
806
  session.execute(
774
807
  sqlalchemy.insert(spot_table).values(
@@ -789,6 +822,9 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
789
822
  This should only be used to transition from STARTING or RECOVERING back to
790
823
  PENDING.
791
824
  """
825
+ await add_job_event_async(job_id, task_id, ManagedJobStatus.PENDING,
826
+ 'Job is in backoff')
827
+
792
828
  assert _SQLALCHEMY_ENGINE_ASYNC is not None
793
829
  async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
794
830
  result = await session.execute(
@@ -824,10 +860,13 @@ async def set_restarting_async(job_id: int, task_id: int, recovering: bool):
824
860
  after using set_backoff_pending to transition back to PENDING during
825
861
  launch retry backoff.
826
862
  """
827
- assert _SQLALCHEMY_ENGINE_ASYNC is not None
828
- target_status = ManagedJobStatus.STARTING.value
863
+ target_status = ManagedJobStatus.STARTING
829
864
  if recovering:
830
- target_status = ManagedJobStatus.RECOVERING.value
865
+ target_status = ManagedJobStatus.RECOVERING
866
+
867
+ await add_job_event_async(job_id, task_id, target_status,
868
+ 'Job is restarting')
869
+ assert _SQLALCHEMY_ENGINE_ASYNC is not None
831
870
  async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
832
871
  result = await session.execute(
833
872
  sqlalchemy.update(spot_table).where(
@@ -835,7 +874,7 @@ async def set_restarting_async(job_id: int, task_id: int, recovering: bool):
835
874
  spot_table.c.spot_job_id == job_id,
836
875
  spot_table.c.task_id == task_id,
837
876
  spot_table.c.end_at.is_(None),
838
- )).values({spot_table.c.status: target_status}))
877
+ )).values({spot_table.c.status: target_status.value}))
839
878
  count = result.rowcount
840
879
  await session.commit()
841
880
  logger.debug(f'back to {target_status}')
@@ -936,6 +975,8 @@ def set_pending_cancelled(job_id: int):
936
975
  Returns:
937
976
  True if the job was cancelled, False otherwise.
938
977
  """
978
+ add_job_event(job_id, None, ManagedJobStatus.CANCELLED,
979
+ 'Job has been cancelled')
939
980
  assert _SQLALCHEMY_ENGINE is not None
940
981
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
941
982
  # Subquery to get the spot_job_ids that match the joined condition
@@ -1681,6 +1722,29 @@ def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
1681
1722
  session.commit()
1682
1723
 
1683
1724
 
1725
+ @_init_db
1726
+ def update_job_full_resources(job_id: int,
1727
+ full_resources_json: Dict[str, Any]) -> None:
1728
+ """Update the full_resources column for a job.
1729
+
1730
+ This is called after scheduling to set the specific resource that was
1731
+ selected from an any_of or ordered list. The update happens within the
1732
+ filelock in get_next_cluster_name to ensure atomicity.
1733
+
1734
+ Args:
1735
+ job_id: The spot_job_id to update
1736
+ full_resources_json: The resolved resource configuration (single
1737
+ resource, not any_of/ordered)
1738
+ """
1739
+ assert _SQLALCHEMY_ENGINE is not None
1740
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1741
+ session.execute(
1742
+ sqlalchemy.update(spot_table).where(
1743
+ spot_table.c.spot_job_id == job_id).values(
1744
+ {spot_table.c.full_resources: full_resources_json}))
1745
+ session.commit()
1746
+
1747
+
1684
1748
  @_init_db_async
1685
1749
  async def set_job_id_on_pool_cluster_async(job_id: int,
1686
1750
  job_id_on_pool_cluster: int) -> None:
@@ -1857,6 +1921,83 @@ def get_nonterminal_job_ids_by_pool(pool: str,
1857
1921
  return job_ids
1858
1922
 
1859
1923
 
1924
+ def _is_any_of_or_ordered(resource_config: Dict[str, Any]) -> bool:
1925
+ """Check if resource config is heterogeneous (any_of or ordered).
1926
+
1927
+ Args:
1928
+ resource_config: Resource configuration dictionary
1929
+
1930
+ Returns:
1931
+ True if the config contains 'any_of' or 'ordered' keys, indicating
1932
+ heterogeneous resources that haven't been resolved to a specific
1933
+ resource yet.
1934
+ """
1935
+ return 'any_of' in resource_config or 'ordered' in resource_config
1936
+
1937
+
1938
+ @_init_db
1939
+ def get_pool_worker_used_resources(
1940
+ job_ids: Set[int]) -> Optional['resources_lib.Resources']:
1941
+ """Get the total used resources by running jobs.
1942
+
1943
+ Args:
1944
+ job_ids: Set of spot_job_id values to check
1945
+
1946
+ Returns:
1947
+ Resources object with summed resources from all running jobs, or None
1948
+ if we couldn't parse the resources string for any job.
1949
+ """
1950
+ if not job_ids:
1951
+ return None
1952
+
1953
+ assert _SQLALCHEMY_ENGINE is not None
1954
+
1955
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1956
+ # Query spot_table for full_resources. Use full_resources if available,
1957
+ # otherwise fall back to resources for backward compatibility.
1958
+ # Don't check for running status because we want to include jobs that
1959
+ # may have just been scheduled. The job_ids come from
1960
+ # get_nonterminal_job_ids_by_pool anyway so we don't need to worry
1961
+ # about removing old jobs.
1962
+ query = sqlalchemy.select(spot_table.c.full_resources).where(
1963
+ sqlalchemy.and_(spot_table.c.spot_job_id.in_(job_ids)))
1964
+ rows = session.execute(query).fetchall()
1965
+
1966
+ resource_configs = []
1967
+ for row in rows:
1968
+ if row[0] is None:
1969
+ # We don't have full_resources for this job. We should return
1970
+ # none since we can't make any guarantees about what resources
1971
+ # are being used.
1972
+ return None
1973
+ resource_configs.append(row[0])
1974
+
1975
+ # Parse resources dicts into Resources objects and sum them using +
1976
+ total_resources = None
1977
+ # full_resources is now stored as JSON dict from to_yaml_config()
1978
+ for resource_config in resource_configs:
1979
+ # Check if this is an unresolved heterogeneous config (any_of/ordered)
1980
+ if _is_any_of_or_ordered(resource_config):
1981
+ # Can't determine usage for heterogeneous unresolved configs.
1982
+ # Return None to fall back to non-resource-aware scheduling.
1983
+ return None
1984
+
1985
+ resources_set = resources_lib.Resources.from_yaml_config(
1986
+ resource_config)
1987
+ if len(resources_set) == 0:
1988
+ # We couldn't parse the resources JSON. We should return
1989
+ # none since we can't make any guarantees about what resources
1990
+ # are being used.
1991
+ return None
1992
+ # Get the first Resources object from the set/list
1993
+ parsed = next(iter(resources_set))
1994
+ if total_resources is None:
1995
+ total_resources = parsed
1996
+ else:
1997
+ total_resources = total_resources + parsed
1998
+ return total_resources
1999
+
2000
+
1860
2001
  @_init_db_async
1861
2002
  async def get_waiting_job_async(
1862
2003
  pid: int, pid_started_at: float) -> Optional[Dict[str, Any]]:
@@ -1964,14 +2105,30 @@ async def get_latest_task_id_status_async(
1964
2105
 
1965
2106
 
1966
2107
  @_init_db_async
1967
- async def set_starting_async(job_id: int, task_id: int, run_timestamp: str,
1968
- submit_time: float, resources_str: str,
2108
+ async def set_starting_async(job_id: int,
2109
+ task_id: int,
2110
+ run_timestamp: str,
2111
+ submit_time: float,
2112
+ resources_str: str,
1969
2113
  specs: Dict[str, Union[str, int]],
1970
- callback_func: AsyncCallbackType):
2114
+ callback_func: AsyncCallbackType,
2115
+ full_resources_json: Optional[Dict[str,
2116
+ Any]] = None):
1971
2117
  """Set the task to starting state."""
2118
+ await add_job_event_async(job_id, task_id, ManagedJobStatus.STARTING,
2119
+ 'Job is starting')
1972
2120
  assert _SQLALCHEMY_ENGINE_ASYNC is not None
1973
2121
  logger.info('Launching the spot cluster...')
1974
2122
  async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
2123
+ values = {
2124
+ spot_table.c.resources: resources_str,
2125
+ spot_table.c.submitted_at: submit_time,
2126
+ spot_table.c.status: ManagedJobStatus.STARTING.value,
2127
+ spot_table.c.run_timestamp: run_timestamp,
2128
+ spot_table.c.specs: json.dumps(specs),
2129
+ }
2130
+ if full_resources_json is not None:
2131
+ values[spot_table.c.full_resources] = full_resources_json
1975
2132
  result = await session.execute(
1976
2133
  sqlalchemy.update(spot_table).where(
1977
2134
  sqlalchemy.and_(
@@ -1979,13 +2136,7 @@ async def set_starting_async(job_id: int, task_id: int, run_timestamp: str,
1979
2136
  spot_table.c.task_id == task_id,
1980
2137
  spot_table.c.status == ManagedJobStatus.PENDING.value,
1981
2138
  spot_table.c.end_at.is_(None),
1982
- )).values({
1983
- spot_table.c.resources: resources_str,
1984
- spot_table.c.submitted_at: submit_time,
1985
- spot_table.c.status: ManagedJobStatus.STARTING.value,
1986
- spot_table.c.run_timestamp: run_timestamp,
1987
- spot_table.c.specs: json.dumps(specs),
1988
- }))
2139
+ )).values(values))
1989
2140
  count = result.rowcount
1990
2141
  await session.commit()
1991
2142
  if count != 1:
@@ -2003,6 +2154,8 @@ async def set_starting_async(job_id: int, task_id: int, run_timestamp: str,
2003
2154
  async def set_started_async(job_id: int, task_id: int, start_time: float,
2004
2155
  callback_func: AsyncCallbackType):
2005
2156
  """Set the task to started state."""
2157
+ await add_job_event_async(job_id, task_id, ManagedJobStatus.RUNNING,
2158
+ 'Job has started')
2006
2159
  assert _SQLALCHEMY_ENGINE_ASYNC is not None
2007
2160
  logger.info('Job started.')
2008
2161
  async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
@@ -2047,10 +2200,23 @@ async def get_job_status_with_task_id_async(
2047
2200
 
2048
2201
 
2049
2202
  @_init_db_async
2050
- async def set_recovering_async(job_id: int, task_id: int,
2051
- force_transit_to_recovering: bool,
2052
- callback_func: AsyncCallbackType):
2203
+ async def set_recovering_async(
2204
+ job_id: int,
2205
+ task_id: int,
2206
+ force_transit_to_recovering: bool,
2207
+ callback_func: AsyncCallbackType,
2208
+ external_failures: Optional[List[ExternalClusterFailure]] = None,
2209
+ ):
2053
2210
  """Set the task to recovering state, and update the job duration."""
2211
+ # Build code and reason from external failures for the event log
2212
+ if external_failures:
2213
+ code = '; '.join(f.code for f in external_failures)
2214
+ reason = '; '.join(f.reason for f in external_failures)
2215
+ else:
2216
+ code = None
2217
+ reason = 'Cluster preempted or failed, recovering'
2218
+ await add_job_event_async(job_id, task_id, ManagedJobStatus.RECOVERING,
2219
+ reason, code)
2054
2220
  assert _SQLALCHEMY_ENGINE_ASYNC is not None
2055
2221
  logger.info('=== Recovering... ===')
2056
2222
  current_time = time.time()
@@ -2099,6 +2265,8 @@ async def set_recovering_async(job_id: int, task_id: int,
2099
2265
  async def set_recovered_async(job_id: int, task_id: int, recovered_time: float,
2100
2266
  callback_func: AsyncCallbackType):
2101
2267
  """Set the task to recovered."""
2268
+ await add_job_event_async(job_id, task_id, ManagedJobStatus.RUNNING,
2269
+ 'Job has recovered')
2102
2270
  assert _SQLALCHEMY_ENGINE_ASYNC is not None
2103
2271
  async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
2104
2272
  result = await session.execute(
@@ -2131,6 +2299,8 @@ async def set_recovered_async(job_id: int, task_id: int, recovered_time: float,
2131
2299
  async def set_succeeded_async(job_id: int, task_id: int, end_time: float,
2132
2300
  callback_func: AsyncCallbackType):
2133
2301
  """Set the task to succeeded, if it is in a non-terminal state."""
2302
+ await add_job_event_async(job_id, task_id, ManagedJobStatus.SUCCEEDED,
2303
+ 'Job has succeeded')
2134
2304
  assert _SQLALCHEMY_ENGINE_ASYNC is not None
2135
2305
  async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
2136
2306
  result = await session.execute(
@@ -2168,6 +2338,8 @@ async def set_failed_async(
2168
2338
  override_terminal: bool = False,
2169
2339
  ):
2170
2340
  """Set an entire job or task to failed."""
2341
+ await add_job_event_async(job_id, task_id, failure_type,
2342
+ f'Job failed: {failure_reason}')
2171
2343
  assert _SQLALCHEMY_ENGINE_ASYNC is not None
2172
2344
  assert failure_type.is_failed(), failure_type
2173
2345
  end_time = time.time() if end_time is None else end_time
@@ -2217,10 +2389,59 @@ async def set_failed_async(
2217
2389
  logger.info(failure_reason)
2218
2390
 
2219
2391
 
2392
@_init_db_async
async def update_links_async(job_id: int, task_id: int,
                             links: Dict[str, str]) -> None:
    """Merge new links into a managed job task's stored links.

    Links live in a JSON-typed column, so SQLAlchemy handles the
    serialization/deserialization transparently.

    The read-modify-write happens in one transaction. On PostgreSQL we also
    take a row lock (SELECT ... FOR UPDATE); SQLite has no row-level locks,
    but its database-level write lock already serializes write transactions.
    """
    assert _SQLALCHEMY_ENGINE_ASYNC is not None
    logger.info(f'Updating external links with: {links}')
    where_clause = sqlalchemy.and_(spot_table.c.spot_job_id == job_id,
                                   spot_table.c.task_id == task_id)
    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
        async with session.begin():
            query = sqlalchemy.select(spot_table.c.links).where(where_clause)
            # Row-level locking is only supported on PostgreSQL; on SQLite
            # we rely on its database-level write locking instead.
            is_postgres = (_SQLALCHEMY_ENGINE_ASYNC.dialect.name ==
                           db_utils.SQLAlchemyDialect.POSTGRESQL.value)
            if is_postgres:
                query = query.with_for_update()

            row = (await session.execute(query)).fetchone()
            merged = dict(row[0]) if row is not None and row[0] else {}
            # New links win over any existing entries with the same key.
            merged.update(links)

            # Write the merged mapping back; the JSON column type handles
            # serialization.
            await session.execute(
                sqlalchemy.update(spot_table).where(where_clause).values(
                    {spot_table.c.links: merged}))
        # Exiting session.begin() commits the transaction automatically.
2437
+
2438
+
2220
2439
  @_init_db_async
2221
2440
  async def set_cancelling_async(job_id: int, callback_func: AsyncCallbackType):
2222
2441
  """Set tasks in the job as cancelling, if they are in non-terminal
2223
2442
  states."""
2443
+ await add_job_event_async(job_id, None, ManagedJobStatus.CANCELLING,
2444
+ 'Job is cancelling')
2224
2445
  assert _SQLALCHEMY_ENGINE_ASYNC is not None
2225
2446
  async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
2226
2447
  result = await session.execute(
@@ -2243,6 +2464,8 @@ async def set_cancelling_async(job_id: int, callback_func: AsyncCallbackType):
2243
2464
  @_init_db_async
2244
2465
  async def set_cancelled_async(job_id: int, callback_func: AsyncCallbackType):
2245
2466
  """Set tasks in the job as cancelled, if they are in CANCELLING state."""
2467
+ await add_job_event_async(job_id, None, ManagedJobStatus.CANCELLED,
2468
+ 'Job has been cancelled')
2246
2469
  assert _SQLALCHEMY_ENGINE_ASYNC is not None
2247
2470
  async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
2248
2471
  result = await session.execute(
@@ -2519,3 +2742,178 @@ def set_controller_logs_cleaned(job_ids: List[int], logs_cleaned_at: float):
2519
2742
  job_info_table.c.spot_job_id.in_(job_ids)).values(
2520
2743
  controller_logs_cleaned_at=logs_cleaned_at))
2521
2744
  session.commit()
2745
+
2746
+
2747
@_init_db
def add_job_event(job_id: int,
                  task_id: Optional[int],
                  new_status: ManagedJobStatus,
                  reason: str,
                  timestamp: Optional[datetime.datetime] = None,
                  code: Optional[str] = None) -> None:
    """Add a job event record to the audit log.

    Args:
        job_id: The spot_job_id of the managed job.
        task_id: The task_id within the managed job. If None, adds a
            job-level event that applies to all tasks.
        new_status: The new status being transitioned to. Can be a
            ManagedJobStatus enum.
        reason: A description of why the event occurred.
        timestamp: The timestamp of the event. If None, uses current time.
        code: Optional error category code for failures. Added (trailing,
            defaulting to None) so the sync path records the same columns
            as add_job_event_async; previously the column was always left
            NULL by this function.
    """
    if timestamp is None:
        timestamp = datetime.datetime.now()

    status_value = new_status.value

    assert _SQLALCHEMY_ENGINE is not None
    with orm.Session(_SQLALCHEMY_ENGINE) as session:
        session.execute(job_events_table.insert().values(
            spot_job_id=job_id,
            task_id=task_id,  # Can be None for job-level events
            new_status=status_value,
            code=code,
            reason=reason,
            timestamp=timestamp,
        ))
        session.commit()
2779
+
2780
+
2781
async def _get_all_task_ids_async(job_id: int) -> List[int]:
    """Get all task IDs for a job (async version)."""
    assert _SQLALCHEMY_ENGINE_ASYNC is not None
    # Task IDs are returned in ascending order.
    query = sqlalchemy.select(spot_table.c.task_id).where(
        spot_table.c.spot_job_id == job_id).order_by(
            spot_table.c.task_id.asc())
    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
        rows = (await session.execute(query)).fetchall()
    return [task_id for (task_id,) in rows]
2790
+
2791
+
2792
@_init_db_async
async def add_job_event_async(
        job_id: int,
        task_id: Optional[int],
        new_status: ManagedJobStatus,
        reason: str,
        code: Optional[str] = None,
        timestamp: Optional[datetime.datetime] = None) -> None:
    """Record a job event in the audit log (async version).

    Args:
        job_id: The spot_job_id of the managed job.
        task_id: The task_id within the managed job. A None value records
            a job-level event that applies to all tasks.
        new_status: The new status being transitioned to. Can be a
            ManagedJobStatus enum.
        reason: A description of why the event occurred.
        code: Optional error category code for failures.
        timestamp: When the event occurred. Defaults to the current time.
    """
    event_time = (datetime.datetime.now() if timestamp is None else timestamp)

    insert_stmt = job_events_table.insert().values(
        spot_job_id=job_id,
        task_id=task_id,  # None marks a job-level event.
        new_status=new_status.value,
        code=code,
        reason=reason,
        timestamp=event_time,
    )

    assert _SQLALCHEMY_ENGINE_ASYNC is not None
    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
        await session.execute(insert_stmt)
        await session.commit()
2828
+
2829
+
2830
@_init_db
def get_job_events(job_id: int,
                   task_id: Optional[int] = None,
                   limit: Optional[int] = None) -> List[Dict[str, Any]]:
    """Get task events for a managed job.

    Args:
        job_id: The spot_job_id of the managed job.
        task_id: Optional task_id to filter by. If None, returns events
            for all tasks. If specified, returns events for that task plus
            job-level events (where task_id is None).
        limit: Optional limit on number of events to return. If specified,
            returns the most recent N events.

    Returns:
        List of event records, always ordered by timestamp descending
        (most recent first), regardless of whether limit is specified.
        Each record maps 'spot_job_id', 'task_id', 'new_status' (a
        ManagedJobStatus), 'code', 'reason', and 'timestamp'.
    """
    assert _SQLALCHEMY_ENGINE is not None
    with orm.Session(_SQLALCHEMY_ENGINE) as session:
        query = sqlalchemy.select(
            job_events_table.c.spot_job_id,
            job_events_table.c.task_id,
            job_events_table.c.new_status,
            job_events_table.c.code,
            job_events_table.c.reason,
            job_events_table.c.timestamp,
        ).where(job_events_table.c.spot_job_id == job_id)

        if task_id is not None:
            # Include events for the specific task AND job-level events
            # (task_id is None)
            query = query.where(
                sqlalchemy.or_(job_events_table.c.task_id == task_id,
                               job_events_table.c.task_id.is_(None)))

        # Order by timestamp descending to get most recent first
        query = query.order_by(job_events_table.c.timestamp.desc())

        if limit is not None:
            query = query.limit(limit)

        rows = session.execute(query).fetchall()
        return [{
            'spot_job_id': row[0],
            'task_id': row[1],
            'new_status': ManagedJobStatus(row[2]),
            'code': row[3],
            'reason': row[4],
            'timestamp': row[5],
        } for row in rows]
2881
+
2882
+
2883
@_init_db_async
async def cleanup_job_events_with_retention_async(
        retention_hours: float) -> None:
    """Delete job events older than the retention period.

    Args:
        retention_hours: Number of hours to retain job events.
    """
    assert _SQLALCHEMY_ENGINE_ASYNC is not None
    # Anything strictly older than the cutoff is purged.
    cutoff = datetime.datetime.now() - datetime.timedelta(
        hours=retention_hours)
    delete_stmt = sqlalchemy.delete(job_events_table).where(
        job_events_table.c.timestamp < cutoff)

    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
        result = await session.execute(delete_stmt)
        deleted = result.rowcount
        if deleted > 0:
            logger.debug(f'Deleted {deleted} job events older than '
                         f'{retention_hours} hours.')
        await session.commit()
2904
+
2905
+
2906
async def job_event_retention_daemon():
    """Garbage collect job events periodically.

    Runs forever, purging events older than
    DEFAULT_JOB_EVENT_RETENTION_HOURS every
    JOB_EVENT_DAEMON_INTERVAL_SECONDS. Exits only on task cancellation.
    """
    while True:
        logger.info('Running job event retention daemon...')
        try:
            await cleanup_job_events_with_retention_async(
                DEFAULT_JOB_EVENT_RETENTION_HOURS)
        except asyncio.CancelledError:
            # Cancellation during cleanup exits the loop cleanly. NOTE:
            # cancellation during the sleep below is not caught here and
            # propagates to the caller instead — presumably acceptable;
            # confirm with the task's supervisor.
            logger.info('Job event retention daemon cancelled')
            break
        except Exception as e:  # pylint: disable=broad-except
            # Best-effort: log and keep the daemon alive on any other error.
            logger.error(f'Error running job event retention daemon: {e}')

        await asyncio.sleep(JOB_EVENT_DAEMON_INTERVAL_SECONDS)