skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -10,8 +10,7 @@ import sqlite3
 import threading
 import time
 import typing
-from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
-                    Union)
+from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, Union
 import urllib.parse
 
 import colorama
@@ -94,6 +93,7 @@ spot_table = sqlalchemy.Table(
     sqlalchemy.Column('specs', sqlalchemy.Text),
     sqlalchemy.Column('local_log_file', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
+    sqlalchemy.Column('logs_cleaned_at', sqlalchemy.Float, server_default=None),
 )
 
 job_info_table = sqlalchemy.Table(
@@ -109,6 +109,8 @@ job_info_table = sqlalchemy.Table(
                       server_default=None),
     sqlalchemy.Column('dag_yaml_path', sqlalchemy.Text),
     sqlalchemy.Column('env_file_path', sqlalchemy.Text),
+    sqlalchemy.Column('dag_yaml_content', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('env_file_content', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('user_hash', sqlalchemy.Text),
     sqlalchemy.Column('workspace', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('priority',
@@ -118,6 +120,9 @@ job_info_table = sqlalchemy.Table(
     sqlalchemy.Column('original_user_yaml_path',
                       sqlalchemy.Text,
                       server_default=None),
+    sqlalchemy.Column('original_user_yaml_content',
+                      sqlalchemy.Text,
+                      server_default=None),
     sqlalchemy.Column('pool', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('current_cluster_name',
                       sqlalchemy.Text,
@@ -126,6 +131,9 @@ job_info_table = sqlalchemy.Table(
                       sqlalchemy.Integer,
                       server_default=None),
     sqlalchemy.Column('pool_hash', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('controller_logs_cleaned_at',
+                      sqlalchemy.Float,
+                      server_default=None),
 )
 
 ha_recovery_script_table = sqlalchemy.Table(
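The five nullable columns introduced above (logs_cleaned_at, dag_yaml_content, env_file_content, original_user_yaml_content, controller_logs_cleaned_at) line up with the new spot_jobs migrations in the file list (004_job_file_contents.py and 005_logs_gc.py). As a rough illustration of what such an additive migration looks like with Alembic, assuming the tables are named 'spot' and 'job_info' as in the SQLAlchemy definitions; the shipped migration files may be organized differently:

    # Illustrative Alembic sketch only; not the contents of
    # sky/schemas/db/spot_jobs/005_logs_gc.py.
    import sqlalchemy as sa
    from alembic import op

    def upgrade():
        # Nullable columns can be added in place without rewriting existing rows.
        op.add_column('spot',
                      sa.Column('logs_cleaned_at', sa.Float(), nullable=True))
        op.add_column('job_info',
                      sa.Column('controller_logs_cleaned_at', sa.Float(),
                                nullable=True))

    def downgrade():
        op.drop_column('job_info', 'controller_logs_cleaned_at')
        op.drop_column('spot', 'logs_cleaned_at')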
@@ -280,6 +288,27 @@ def _init_db(func):
     return wrapper
 
 
+async def _describe_task_transition_failure(session: sql_async.AsyncSession,
+                                            job_id: int, task_id: int) -> str:
+    """Return a human-readable description when a task transition fails."""
+    details = 'Couldn\'t fetch the task details.'
+    try:
+        debug_result = await session.execute(
+            sqlalchemy.select(spot_table.c.status, spot_table.c.end_at).where(
+                sqlalchemy.and_(spot_table.c.spot_job_id == job_id,
+                                spot_table.c.task_id == task_id)))
+        rows = debug_result.mappings().all()
+        details = (f'{len(rows)} rows matched job {job_id} and task '
+                   f'{task_id}.')
+        for row in rows:
+            status = row['status']
+            end_at = row['end_at']
+            details += f' Status: {status}, End time: {end_at}.'
+    except Exception as exc:  # pylint: disable=broad-except
+        details += f' Error fetching task details: {exc}'
+    return details
+
+
 # job_duration is the time a job actually runs (including the
 # setup duration) before last_recover, excluding the provision
 # and recovery time.
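Each async status-transition setter further down now awaits this helper whenever its UPDATE matches an unexpected number of rows, so the raised error carries the conflicting row state instead of only a row count. Assuming a single matching row that has already ended, the message assembled by, for example, set_starting_async would look roughly like this (an illustrative composition of the f-strings shown below, not a captured log):

    details = ('1 rows matched job 7 and task 0. '
               'Status: CANCELLED, End time: 1730000000.0.')
    message = ('Failed to set the task to starting. '
               f'(0 rows updated. {details})')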
@@ -293,42 +322,50 @@ def _init_db(func):
 # column names in the DB and it corresponds to the combined view
 # by joining the spot and job_info tables.
 def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
+    # WARNING: If you update these you may also need to update GetJobTable in
+    # the skylet ManagedJobsServiceImpl.
     return {
-        '_job_id': r['job_id'],  # from spot table
-        '_task_name': r['job_name'],  # deprecated, from spot table
-        'resources': r['resources'],
-        'submitted_at': r['submitted_at'],
-        'status': r['status'],
-        'run_timestamp': r['run_timestamp'],
-        'start_at': r['start_at'],
-        'end_at': r['end_at'],
-        'last_recovered_at': r['last_recovered_at'],
-        'recovery_count': r['recovery_count'],
-        'job_duration': r['job_duration'],
-        'failure_reason': r['failure_reason'],
-        'job_id': r[spot_table.c.spot_job_id],  # ambiguous, use table.column
-        'task_id': r['task_id'],
-        'task_name': r['task_name'],
-        'specs': r['specs'],
-        'local_log_file': r['local_log_file'],
-        'metadata': r['metadata'],
+        '_job_id': r.get('job_id'),  # from spot table
+        '_task_name': r.get('job_name'),  # deprecated, from spot table
+        'resources': r.get('resources'),
+        'submitted_at': r.get('submitted_at'),
+        'status': r.get('status'),
+        'run_timestamp': r.get('run_timestamp'),
+        'start_at': r.get('start_at'),
+        'end_at': r.get('end_at'),
+        'last_recovered_at': r.get('last_recovered_at'),
+        'recovery_count': r.get('recovery_count'),
+        'job_duration': r.get('job_duration'),
+        'failure_reason': r.get('failure_reason'),
+        'job_id': r.get(spot_table.c.spot_job_id
+                       ),  # ambiguous, use table.column
+        'task_id': r.get('task_id'),
+        'task_name': r.get('task_name'),
+        'specs': r.get('specs'),
+        'local_log_file': r.get('local_log_file'),
+        'metadata': r.get('metadata'),
         # columns from job_info table (some may be None for legacy jobs)
-        '_job_info_job_id': r[job_info_table.c.spot_job_id
-                             ],  # ambiguous, use table.column
-        'job_name': r['name'],  # from job_info table
-        'schedule_state': r['schedule_state'],
-        'controller_pid': r['controller_pid'],
-        'dag_yaml_path': r['dag_yaml_path'],
-        'env_file_path': r['env_file_path'],
-        'user_hash': r['user_hash'],
-        'workspace': r['workspace'],
-        'priority': r['priority'],
-        'entrypoint': r['entrypoint'],
-        'original_user_yaml_path': r['original_user_yaml_path'],
-        'pool': r['pool'],
-        'current_cluster_name': r['current_cluster_name'],
-        'job_id_on_pool_cluster': r['job_id_on_pool_cluster'],
-        'pool_hash': r['pool_hash'],
+        '_job_info_job_id': r.get(job_info_table.c.spot_job_id
+                                 ),  # ambiguous, use table.column
+        'job_name': r.get('name'),  # from job_info table
+        'schedule_state': r.get('schedule_state'),
+        'controller_pid': r.get('controller_pid'),
+        # the _path columns are for backwards compatibility, use the _content
+        # columns instead
+        'dag_yaml_path': r.get('dag_yaml_path'),
+        'env_file_path': r.get('env_file_path'),
+        'dag_yaml_content': r.get('dag_yaml_content'),
+        'env_file_content': r.get('env_file_content'),
+        'user_hash': r.get('user_hash'),
+        'workspace': r.get('workspace'),
+        'priority': r.get('priority'),
+        'entrypoint': r.get('entrypoint'),
+        'original_user_yaml_path': r.get('original_user_yaml_path'),
+        'original_user_yaml_content': r.get('original_user_yaml_content'),
+        'pool': r.get('pool'),
+        'current_cluster_name': r.get('current_cluster_name'),
+        'job_id_on_pool_cluster': r.get('job_id_on_pool_cluster'),
+        'pool_hash': r.get('pool_hash'),
     }
 
 
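The switch from r['col'] to r.get('col') matters because the filtered queries added later in this diff can trim the SELECT with with_only_columns(), so a RowMapping may not contain every key. A minimal illustration of the difference under SQLAlchemy 2.0 mapping semantics (not taken from the SkyPilot code):

    # RowMapping behaves like a read-only dict, so absent columns can be
    # absorbed with .get() instead of raising KeyError.
    row = session.execute(
        sqlalchemy.select(spot_table.c.status)).mappings().first()
    row['status']      # selected column -> its value
    row.get('end_at')  # column not selected -> None
    # row['end_at'] would raise KeyError because only `status` was selected.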
@@ -671,8 +708,8 @@ class ManagedJobScheduleState(enum.Enum):
 # === Status transition functions ===
 @_init_db
 def set_job_info_without_job_id(name: str, workspace: str, entrypoint: str,
-                                pool: Optional[str],
-                                pool_hash: Optional[str]) -> int:
+                                pool: Optional[str], pool_hash: Optional[str],
+                                user_hash: Optional[str]) -> int:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -691,6 +728,7 @@ def set_job_info_without_job_id(name: str, workspace: str, entrypoint: str,
             entrypoint=entrypoint,
             pool=pool,
             pool_hash=pool_hash,
+            user_hash=user_hash,
         )
 
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -758,9 +796,12 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
         count = result.rowcount
         await session.commit()
         if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                'Failed to set the task back to pending. '
-                f'({count} rows updated)')
+            details = await _describe_task_transition_failure(
+                session, job_id, task_id)
+            message = ('Failed to set the task back to pending. '
+                       f'({count} rows updated. {details})')
+            logger.error(message)
+            raise exceptions.ManagedJobStatusError(message)
     # Do not call callback_func here, as we don't use the callback for PENDING.
 
 
@@ -789,9 +830,12 @@ async def set_restarting_async(job_id: int, task_id: int, recovering: bool):
         await session.commit()
         logger.debug(f'back to {target_status}')
         if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                f'Failed to set the task back to {target_status}. '
-                f'({count} rows updated)')
+            details = await _describe_task_transition_failure(
+                session, job_id, task_id)
+            message = (f'Failed to set the task back to {target_status}. '
+                       f'({count} rows updated. {details})')
+            logger.error(message)
+            raise exceptions.ManagedJobStatusError(message)
     # Do not call callback_func here, as it should only be invoked for the
     # initial (pre-`set_backoff_pending`) transition to STARTING or RECOVERING.
 
@@ -1048,7 +1092,8 @@ def _get_all_task_ids_statuses(
 
 @_init_db
 def get_all_task_ids_names_statuses_logs(
-        job_id: int) -> List[Tuple[int, str, ManagedJobStatus, str]]:
+    job_id: int
+) -> List[Tuple[int, str, ManagedJobStatus, str, Optional[float]]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         id_names = session.execute(
@@ -1057,9 +1102,10 @@ def get_all_task_ids_names_statuses_logs(
                 spot_table.c.task_name,
                 spot_table.c.status,
                 spot_table.c.local_log_file,
+                spot_table.c.logs_cleaned_at,
             ).where(spot_table.c.spot_job_id == job_id).order_by(
                 spot_table.c.task_id.asc())).fetchall()
-        return [(row[0], row[1], ManagedJobStatus(row[2]), row[3])
+        return [(row[0], row[1], ManagedJobStatus(row[2]), row[3], row[4])
                 for row in id_names]
 
 
@@ -1124,8 +1170,8 @@ def get_failure_reason(job_id: int) -> Optional[str]:
 
 
 @_init_db
-def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
-    """Get managed jobs from the database."""
+def get_managed_job_tasks(job_id: int) -> List[Dict[str, Any]]:
+    """Get managed job tasks for a specific managed job id from the database."""
     assert _SQLALCHEMY_ENGINE is not None
 
     # Join spot and job_info tables to get the job name for each task.
@@ -1140,10 +1186,8 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
         spot_table.outerjoin(
             job_info_table,
             spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
-    if job_id is not None:
-        query = query.where(spot_table.c.spot_job_id == job_id)
-    query = query.order_by(spot_table.c.spot_job_id.desc(),
-                           spot_table.c.task_id.asc())
+    query = query.where(spot_table.c.spot_job_id == job_id)
+    query = query.order_by(spot_table.c.task_id.asc())
     rows = None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         rows = session.execute(query).fetchall()
@@ -1158,20 +1202,307 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
         job_dict['metadata'] = json.loads(job_dict['metadata'])
 
         # Add user YAML content for managed jobs.
-        yaml_path = job_dict.get('original_user_yaml_path')
-        if yaml_path:
-            try:
-                with open(yaml_path, 'r', encoding='utf-8') as f:
-                    job_dict['user_yaml'] = f.read()
-            except (FileNotFoundError, IOError, OSError):
-                job_dict['user_yaml'] = None
-        else:
-            job_dict['user_yaml'] = None
+        job_dict['user_yaml'] = job_dict.get('original_user_yaml_content')
+        if job_dict['user_yaml'] is None:
+            # Backwards compatibility - try to read from file path
+            yaml_path = job_dict.get('original_user_yaml_path')
+            if yaml_path:
+                try:
+                    with open(yaml_path, 'r', encoding='utf-8') as f:
+                        job_dict['user_yaml'] = f.read()
+                except (FileNotFoundError, IOError, OSError) as e:
+                    logger.debug('Failed to read original user YAML for job '
+                                 f'{job_id} from {yaml_path}: {e}')
 
         jobs.append(job_dict)
     return jobs
 
 
+def _map_response_field_to_db_column(field: str):
+    """Map the response field name to an actual SQLAlchemy ColumnElement.
+
+    This ensures we never pass plain strings to SQLAlchemy 2.0 APIs like
+    Select.with_only_columns().
+    """
+    # Explicit aliases differing from actual DB column names
+    alias_mapping = {
+        '_job_id': spot_table.c.job_id,  # spot.job_id
+        '_task_name': spot_table.c.job_name,  # deprecated, from spot table
+        'job_id': spot_table.c.spot_job_id,  # public job id -> spot.spot_job_id
+        '_job_info_job_id': job_info_table.c.spot_job_id,
+        'job_name': job_info_table.c.name,  # public job name -> job_info.name
+    }
+    if field in alias_mapping:
+        return alias_mapping[field]
+
+    # Try direct match on the `spot` table columns
+    if field in spot_table.c:
+        return spot_table.c[field]
+
+    # Try direct match on the `job_info` table columns
+    if field in job_info_table.c:
+        return job_info_table.c[field]
+
+    raise ValueError(f'Unknown field: {field}')
+
+
+@_init_db
+def get_managed_jobs_total() -> int:
+    """Get the total number of managed jobs."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        result = session.execute(
+            sqlalchemy.select(sqlalchemy.func.count()  # pylint: disable=not-callable
+                             ).select_from(spot_table)).fetchone()
+        return result[0] if result else 0
+
+
+@_init_db
+def get_managed_jobs_highest_priority() -> int:
+    """Get the highest priority of the managed jobs."""
+    assert _SQLALCHEMY_ENGINE is not None
+    query = sqlalchemy.select(sqlalchemy.func.max(
+        job_info_table.c.priority)).where(
+            sqlalchemy.and_(
+                job_info_table.c.schedule_state.in_([
+                    ManagedJobScheduleState.LAUNCHING.value,
+                    ManagedJobScheduleState.ALIVE_BACKOFF.value,
+                    ManagedJobScheduleState.WAITING.value,
+                    ManagedJobScheduleState.ALIVE_WAITING.value,
+                ]),
+                job_info_table.c.priority.is_not(None),
+            ))
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        priority = session.execute(query).fetchone()
+        return priority[0] if priority and priority[
+            0] is not None else constants.MIN_PRIORITY
+
+
+def build_managed_jobs_with_filters_no_status_query(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    skip_finished: bool = False,
+    count_only: bool = False,
+    status_count: bool = False,
+) -> sqlalchemy.Select:
+    """Build a query to get managed jobs from the database with filters."""
+    # Join spot and job_info tables to get the job name for each task.
+    # We use LEFT OUTER JOIN mainly for backward compatibility, as for an
+    # existing controller before #1982, the job_info table may not exist,
+    # and all the managed jobs created before will not present in the
+    # job_info.
+    # Note: we will get the user_hash here, but don't try to call
+    # global_user_state.get_user() on it. This runs on the controller, which may
+    # not have the user info. Prefer to do it on the API server side.
+    if count_only:
+        query = sqlalchemy.select(sqlalchemy.func.count().label('count'))  # pylint: disable=not-callable
+    elif status_count:
+        query = sqlalchemy.select(spot_table.c.status,
+                                  sqlalchemy.func.count().label('count'))  # pylint: disable=not-callable
+    else:
+        query = sqlalchemy.select(spot_table, job_info_table)
+    query = query.select_from(
+        spot_table.outerjoin(
+            job_info_table,
+            spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
+    if skip_finished:
+        # Filter out finished jobs at the DB level. If a multi-task job is
+        # partially finished, include all its tasks. We do this by first
+        # selecting job_ids that have at least one non-terminal task, then
+        # restricting the main query to those job_ids.
+        terminal_status_values = [
+            s.value for s in ManagedJobStatus.terminal_statuses()
+        ]
+        non_terminal_job_ids_subquery = (sqlalchemy.select(
+            spot_table.c.spot_job_id).where(
+                sqlalchemy.or_(
+                    spot_table.c.status.is_(None),
+                    sqlalchemy.not_(
+                        spot_table.c.status.in_(terminal_status_values)),
+                )).distinct())
+        query = query.where(
+            spot_table.c.spot_job_id.in_(non_terminal_job_ids_subquery))
+    if not count_only and not status_count and fields:
+        # Resolve requested field names to explicit ColumnElements from
+        # the joined tables.
+        selected_columns = [_map_response_field_to_db_column(f) for f in fields]
+        query = query.with_only_columns(*selected_columns)
+    if job_ids is not None:
+        query = query.where(spot_table.c.spot_job_id.in_(job_ids))
+    if accessible_workspaces is not None:
+        query = query.where(
+            job_info_table.c.workspace.in_(accessible_workspaces))
+    if workspace_match is not None:
+        query = query.where(
+            job_info_table.c.workspace.like(f'%{workspace_match}%'))
+    if name_match is not None:
+        query = query.where(job_info_table.c.name.like(f'%{name_match}%'))
+    if pool_match is not None:
+        query = query.where(job_info_table.c.pool.like(f'%{pool_match}%'))
+    if user_hashes is not None:
+        query = query.where(job_info_table.c.user_hash.in_(user_hashes))
+    return query
+
+
+def build_managed_jobs_with_filters_query(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
+    skip_finished: bool = False,
+    count_only: bool = False,
+) -> sqlalchemy.Select:
+    """Build a query to get managed jobs from the database with filters."""
+    query = build_managed_jobs_with_filters_no_status_query(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        skip_finished=skip_finished,
+        count_only=count_only,
+    )
+    if statuses is not None:
+        query = query.where(spot_table.c.status.in_(statuses))
+    return query
+
+
+@_init_db
+def get_status_count_with_filters(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    skip_finished: bool = False,
+) -> Dict[str, int]:
+    """Get the status count of the managed jobs with filters."""
+    query = build_managed_jobs_with_filters_no_status_query(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        skip_finished=skip_finished,
+        status_count=True,
+    )
+    query = query.group_by(spot_table.c.status)
+    results: Dict[str, int] = {}
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(query).fetchall()
+        for status_value, count in rows:
+            # status_value is already a string (enum value)
+            results[str(status_value)] = int(count)
+    return results
+
+
+@_init_db
+def get_managed_jobs_with_filters(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
+    skip_finished: bool = False,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+) -> Tuple[List[Dict[str, Any]], int]:
+    """Get managed jobs from the database with filters.
+
+    Returns:
+        A tuple containing
+        - the list of managed jobs
+        - the total number of managed jobs
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+
+    count_query = build_managed_jobs_with_filters_query(
+        fields=None,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        statuses=statuses,
+        skip_finished=skip_finished,
+        count_only=True,
+    )
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        total = session.execute(count_query).fetchone()[0]
+
+    query = build_managed_jobs_with_filters_query(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        statuses=statuses,
+        skip_finished=skip_finished,
+    )
+    query = query.order_by(spot_table.c.spot_job_id.desc(),
+                           spot_table.c.task_id.asc())
+    if page is not None and limit is not None:
+        query = query.offset((page - 1) * limit).limit(limit)
+    rows = None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(query).fetchall()
+    jobs = []
+    for row in rows:
+        job_dict = _get_jobs_dict(row._mapping)  # pylint: disable=protected-access
+        if job_dict.get('status') is not None:
+            job_dict['status'] = ManagedJobStatus(job_dict['status'])
+        if job_dict.get('schedule_state') is not None:
+            job_dict['schedule_state'] = ManagedJobScheduleState(
+                job_dict['schedule_state'])
+        if job_dict.get('job_name') is None:
+            job_dict['job_name'] = job_dict.get('task_name')
+        if job_dict.get('metadata') is not None:
+            job_dict['metadata'] = json.loads(job_dict['metadata'])
+
+        # Add user YAML content for managed jobs.
+        job_dict['user_yaml'] = job_dict.get('original_user_yaml_content')
+        if job_dict['user_yaml'] is None:
+            # Backwards compatibility - try to read from file path
+            yaml_path = job_dict.get('original_user_yaml_path')
+            if yaml_path:
+                try:
+                    with open(yaml_path, 'r', encoding='utf-8') as f:
+                        job_dict['user_yaml'] = f.read()
+                except (FileNotFoundError, IOError, OSError) as e:
+                    job_id = job_dict.get('job_id')
+                    if job_id is not None:
+                        logger.debug('Failed to read original user YAML for '
+                                     f'job {job_id} from {yaml_path}: {e}')
+                    else:
+                        logger.debug('Failed to read original user YAML from '
+                                     f'{yaml_path}: {e}')
+
+        jobs.append(job_dict)
+    return jobs, total
+
+
 @_init_db
 def get_task_name(job_id: int, task_id: int) -> str:
     """Get the task name of a job."""
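The query builders and get_managed_jobs_with_filters above back the paginated jobs listing on the API server side (see sky/jobs/server/core.py and sky/server/requests/ in the file list). A hypothetical call site, with filter values invented purely for illustration:

    # Illustrative usage only; argument values are made up.
    jobs, total = get_managed_jobs_with_filters(
        accessible_workspaces=['default'],
        statuses=[ManagedJobStatus.RUNNING.value],
        skip_finished=True,
        page=1,
        limit=50,
    )
    counts = get_status_count_with_filters(accessible_workspaces=['default'])
    print(f'{len(jobs)} of {total} jobs on page 1; per-status counts: {counts}')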
@@ -1212,9 +1543,9 @@ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
 
 
 @_init_db
-def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
-                          original_user_yaml_path: str, env_file_path: str,
-                          user_hash: str, priority: int):
+def scheduler_set_waiting(job_id: int, dag_yaml_content: str,
+                          original_user_yaml_content: str,
+                          env_file_content: str, priority: int):
     """Do not call without holding the scheduler lock.
 
     Returns: Whether this is a recovery run or not.
@@ -1226,20 +1557,48 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         updated_count = session.query(job_info_table).filter(
-            sqlalchemy.and_(job_info_table.c.spot_job_id == job_id,)
-        ).update({
-            job_info_table.c.schedule_state:
-                ManagedJobScheduleState.WAITING.value,
-            job_info_table.c.dag_yaml_path: dag_yaml_path,
-            job_info_table.c.original_user_yaml_path: original_user_yaml_path,
-            job_info_table.c.env_file_path: env_file_path,
-            job_info_table.c.user_hash: user_hash,
-            job_info_table.c.priority: priority,
-        })
+            sqlalchemy.and_(job_info_table.c.spot_job_id == job_id,)).update({
+                job_info_table.c.schedule_state:
+                    ManagedJobScheduleState.WAITING.value,
+                job_info_table.c.dag_yaml_content: dag_yaml_content,
+                job_info_table.c.original_user_yaml_content:
+                    (original_user_yaml_content),
+                job_info_table.c.env_file_content: env_file_content,
+                job_info_table.c.priority: priority,
+            })
         session.commit()
         assert updated_count <= 1, (job_id, updated_count)
 
 
+@_init_db
+def get_job_file_contents(job_id: int) -> Dict[str, Optional[str]]:
+    """Return file information and stored contents for a managed job."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.execute(
+            sqlalchemy.select(
+                job_info_table.c.dag_yaml_path,
+                job_info_table.c.env_file_path,
+                job_info_table.c.dag_yaml_content,
+                job_info_table.c.env_file_content,
+            ).where(job_info_table.c.spot_job_id == job_id)).fetchone()
+
+        if row is None:
+            return {
+                'dag_yaml_path': None,
+                'env_file_path': None,
+                'dag_yaml_content': None,
+                'env_file_content': None,
+            }
+
+        return {
+            'dag_yaml_path': row[0],
+            'env_file_path': row[1],
+            'dag_yaml_content': row[2],
+            'env_file_content': row[3],
+        }
+
+
 @_init_db
 def get_pool_from_job_id(job_id: int) -> Optional[str]:
     """Get the pool from the job id."""
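A sketch of how a caller might consume get_job_file_contents, preferring the DB-stored content and falling back to the legacy path columns for older jobs; the real consumer appears to be the new sky/jobs/file_content_utils.py from the file list, so this pattern is only assumed:

    # Assumed consumer pattern; not copied from sky/jobs/file_content_utils.py.
    info = get_job_file_contents(job_id)
    dag_yaml = info['dag_yaml_content']
    if dag_yaml is None and info['dag_yaml_path']:
        # Backwards compatibility for jobs submitted before the _content columns.
        with open(info['dag_yaml_path'], 'r', encoding='utf-8') as f:
            dag_yaml = f.read()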
@@ -1251,25 +1610,6 @@ def get_pool_from_job_id(job_id: int) -> Optional[str]:
         return pool[0] if pool else None
 
 
-@_init_db
-def get_pool_and_submit_info_from_job_ids(
-    job_ids: Set[int]
-) -> Dict[int, Tuple[Optional[str], Optional[str], Optional[int]]]:
-    """Get the pool, cluster name, and job id on pool from job id"""
-    assert _SQLALCHEMY_ENGINE is not None
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        rows = session.execute(
-            sqlalchemy.select(
-                job_info_table.c.spot_job_id, job_info_table.c.pool,
-                job_info_table.c.current_cluster_name,
-                job_info_table.c.job_id_on_pool_cluster).where(
-                    job_info_table.c.spot_job_id.in_(job_ids))).fetchall()
-        return {
-            job_id: (pool, cluster_name, job_id_on_pool_cluster)
-            for job_id, pool, cluster_name, job_id_on_pool_cluster in rows
-        }
-
-
 @_init_db
 def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
     """Set the current cluster name for a job."""
@@ -1644,9 +1984,12 @@ async def set_starting_async(job_id: int, task_id: int, run_timestamp: str,
         count = result.rowcount
         await session.commit()
         if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                'Failed to set the task to starting. '
-                f'({count} rows updated)')
+            details = await _describe_task_transition_failure(
+                session, job_id, task_id)
+            message = ('Failed to set the task to starting. '
+                       f'({count} rows updated. {details})')
+            logger.error(message)
+            raise exceptions.ManagedJobStatusError(message)
     await callback_func('SUBMITTED')
     await callback_func('STARTING')
 
@@ -1676,9 +2019,12 @@ async def set_started_async(job_id: int, task_id: int, start_time: float,
         count = result.rowcount
         await session.commit()
         if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                f'Failed to set the task to started. '
-                f'({count} rows updated)')
+            details = await _describe_task_transition_failure(
+                session, job_id, task_id)
+            message = (f'Failed to set the task to started. '
+                       f'({count} rows updated. {details})')
+            logger.error(message)
+            raise exceptions.ManagedJobStatusError(message)
     await callback_func('STARTED')
 
 
@@ -1733,9 +2079,14 @@ async def set_recovering_async(job_id: int, task_id: int,
         count = result.rowcount
         await session.commit()
         if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                f'Failed to set the task to recovering. '
-                f'({count} rows updated)')
+            details = await _describe_task_transition_failure(
+                session, job_id, task_id)
+            message = ('Failed to set the task to recovering with '
+                       'force_transit_to_recovering='
+                       f'{force_transit_to_recovering}. '
+                       f'({count} rows updated. {details})')
+            logger.error(message)
+            raise exceptions.ManagedJobStatusError(message)
     await callback_func('RECOVERING')
 
 
@@ -1761,9 +2112,12 @@ async def set_recovered_async(job_id: int, task_id: int, recovered_time: float,
         count = result.rowcount
         await session.commit()
         if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                f'Failed to set the task to recovered. '
-                f'({count} rows updated)')
+            details = await _describe_task_transition_failure(
+                session, job_id, task_id)
+            message = (f'Failed to set the task to recovered. '
+                       f'({count} rows updated. {details})')
+            logger.error(message)
+            raise exceptions.ManagedJobStatusError(message)
     logger.info('==== Recovered. ====')
     await callback_func('RECOVERED')
 
@@ -1788,9 +2142,12 @@ async def set_succeeded_async(job_id: int, task_id: int, end_time: float,
         count = result.rowcount
         await session.commit()
        if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                f'Failed to set the task to succeeded. '
-                f'({count} rows updated)')
+            details = await _describe_task_transition_failure(
+                session, job_id, task_id)
+            message = (f'Failed to set the task to succeeded. '
+                       f'({count} rows updated. {details})')
+            logger.error(message)
+            raise exceptions.ManagedJobStatusError(message)
     await callback_func('SUCCEEDED')
     logger.info('Job succeeded.')
 
@@ -1956,8 +2313,13 @@ async def scheduler_set_done_async(job_id: int,
 
 
 @_init_db
-def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
-                 pool: Optional[str], pool_hash: Optional[str]):
+def set_job_info(job_id: int,
+                 name: str,
+                 workspace: str,
+                 entrypoint: str,
+                 pool: Optional[str],
+                 pool_hash: Optional[str],
+                 user_hash: Optional[str] = None):
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -1976,6 +2338,7 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
             entrypoint=entrypoint,
             pool=pool,
             pool_hash=pool_hash,
+            user_hash=user_hash,
         )
         session.execute(insert_stmt)
         session.commit()
@@ -2029,3 +2392,118 @@ def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
         rows = session.execute(query).fetchall()
         job_ids = [row[0] for row in rows if row[0] is not None]
     return job_ids
+
+
+@_init_db_async
+async def get_task_logs_to_clean_async(retention_seconds: int,
+                                       batch_size) -> List[Dict[str, Any]]:
+    """Get the logs of job tasks to clean.
+
+    The logs of a task will only cleaned when:
+    - the job schedule state is DONE
+    - AND the end time of the task is older than the retention period
+    """
+
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        now = time.time()
+        result = await session.execute(
+            sqlalchemy.select(
+                spot_table.c.spot_job_id,
+                spot_table.c.task_id,
+                spot_table.c.local_log_file,
+            ).select_from(
+                spot_table.join(
+                    job_info_table,
+                    spot_table.c.spot_job_id == job_info_table.c.spot_job_id,
+                )).
+            where(
+                sqlalchemy.and_(
+                    job_info_table.c.schedule_state.is_(
+                        ManagedJobScheduleState.DONE.value),
+                    spot_table.c.end_at.isnot(None),
+                    spot_table.c.end_at < (now - retention_seconds),
+                    spot_table.c.logs_cleaned_at.is_(None),
+                    # The local log file is set AFTER the task is finished,
+                    # add this condition to ensure the entire log file has
+                    # been written.
+                    spot_table.c.local_log_file.isnot(None),
+                )).limit(batch_size))
+        rows = result.fetchall()
+        return [{
+            'job_id': row[0],
+            'task_id': row[1],
+            'local_log_file': row[2]
+        } for row in rows]
+
+
+@_init_db_async
+async def get_controller_logs_to_clean_async(
+        retention_seconds: int, batch_size: int) -> List[Dict[str, Any]]:
+    """Get the controller logs to clean.
+
+    The controller logs will only cleaned when:
+    - the job schedule state is DONE
+    - AND the end time of the latest task is older than the retention period
+    """
+
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        now = time.time()
+
+        result = await session.execute(
+            sqlalchemy.select(job_info_table.c.spot_job_id,).select_from(
+                job_info_table.join(
+                    spot_table,
+                    job_info_table.c.spot_job_id == spot_table.c.spot_job_id,
+                )).where(
+                    sqlalchemy.and_(
+                        job_info_table.c.schedule_state.is_(
+                            ManagedJobScheduleState.DONE.value),
+                        spot_table.c.local_log_file.isnot(None),
+                        job_info_table.c.controller_logs_cleaned_at.is_(None),
+                    )).group_by(
+                        job_info_table.c.spot_job_id,
+                        job_info_table.c.current_cluster_name,
+                    ).having(
+                        sqlalchemy.func.max(
+                            spot_table.c.end_at).isnot(None),).having(
+                                sqlalchemy.func.max(spot_table.c.end_at) < (
+                                    now - retention_seconds)).limit(batch_size))
+        rows = result.fetchall()
+        return [{'job_id': row[0]} for row in rows]
+
+
+@_init_db_async
+async def set_task_logs_cleaned_async(tasks: List[Tuple[int, int]],
+                                      logs_cleaned_at: float):
+    """Set the task logs cleaned at."""
+    if not tasks:
+        return
+    # Deduplicate
+    task_keys = list(dict.fromkeys(tasks))
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        await session.execute(
+            sqlalchemy.update(spot_table).where(
+                sqlalchemy.tuple_(spot_table.c.spot_job_id,
+                                  spot_table.c.task_id).in_(task_keys)).values(
+                    logs_cleaned_at=logs_cleaned_at))
+        await session.commit()
+
+
+@_init_db_async
+async def set_controller_logs_cleaned_async(job_ids: List[int],
+                                            logs_cleaned_at: float):
+    """Set the controller logs cleaned at."""
+    if not job_ids:
+        return
+    # Deduplicate
+    job_ids = list(dict.fromkeys(job_ids))
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        await session.execute(
+            sqlalchemy.update(job_info_table).where(
+                job_info_table.c.spot_job_id.in_(job_ids)).values(
+                    controller_logs_cleaned_at=logs_cleaned_at))
+        await session.commit()
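These four helpers form the storage layer for the new log garbage collection (sky/jobs/log_gc.py and the 005_logs_gc.py migration in the file list). A minimal sketch of a GC pass built on them, with the retention window, batch size, and file-removal step invented for illustration; the shipped implementation may differ:

    import os
    import time

    # Hypothetical GC pass; the real loop lives in sky/jobs/log_gc.py.
    async def gc_task_logs_once(retention_seconds: int = 7 * 24 * 3600,
                                batch_size: int = 100) -> None:
        tasks = await get_task_logs_to_clean_async(retention_seconds, batch_size)
        cleaned = []
        for task in tasks:
            log_file = task['local_log_file']
            if log_file and os.path.exists(log_file):
                os.remove(log_file)
            cleaned.append((task['job_id'], task['task_id']))
        if cleaned:
            # Mark the rows so the next batch does not pick them up again.
            await set_task_logs_cleaned_async(cleaned, time.time())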