skypilot-nightly 1.0.0.dev20251019__py3-none-any.whl → 1.0.0.dev20251022__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic. (More details are linked from the original registry page.)

Files changed (95)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +64 -0
  3. sky/backends/backend_utils.py +11 -11
  4. sky/backends/cloud_vm_ray_backend.py +15 -4
  5. sky/client/cli/command.py +39 -10
  6. sky/client/cli/flags.py +4 -2
  7. sky/client/sdk.py +26 -3
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/IgACOQPupLbX9z-RYVEDx/_buildManifest.js +1 -0
  10. sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
  11. sky/dashboard/out/_next/static/chunks/2755.9b1e69c921b5a870.js +26 -0
  12. sky/dashboard/out/_next/static/chunks/3015-d014dc5b9412fade.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.998db87cd52a1238.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
  16. sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9360.14326e329484b57e.js +31 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
  29. sky/dashboard/out/_next/static/chunks/webpack-919e3c01ab6b2633.js +1 -0
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/config.html +1 -1
  34. sky/dashboard/out/index.html +1 -1
  35. sky/dashboard/out/infra/[context].html +1 -1
  36. sky/dashboard/out/infra.html +1 -1
  37. sky/dashboard/out/jobs/[job].html +1 -1
  38. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  39. sky/dashboard/out/jobs.html +1 -1
  40. sky/dashboard/out/users.html +1 -1
  41. sky/dashboard/out/volumes.html +1 -1
  42. sky/dashboard/out/workspace/new.html +1 -1
  43. sky/dashboard/out/workspaces/[name].html +1 -1
  44. sky/dashboard/out/workspaces.html +1 -1
  45. sky/data/storage.py +2 -2
  46. sky/global_user_state.py +137 -37
  47. sky/jobs/constants.py +1 -1
  48. sky/jobs/server/core.py +4 -2
  49. sky/jobs/server/server.py +21 -12
  50. sky/jobs/state.py +307 -55
  51. sky/jobs/utils.py +248 -144
  52. sky/provision/kubernetes/network.py +9 -6
  53. sky/provision/provisioner.py +8 -0
  54. sky/schemas/api/responses.py +2 -0
  55. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  56. sky/serve/server/server.py +8 -7
  57. sky/server/common.py +10 -15
  58. sky/server/constants.py +1 -1
  59. sky/server/daemons.py +4 -2
  60. sky/server/requests/executor.py +30 -28
  61. sky/server/requests/payloads.py +5 -1
  62. sky/server/requests/preconditions.py +9 -4
  63. sky/server/requests/requests.py +130 -53
  64. sky/server/requests/serializers/encoders.py +3 -3
  65. sky/server/server.py +91 -58
  66. sky/server/stream_utils.py +127 -38
  67. sky/server/uvicorn.py +18 -17
  68. sky/setup_files/alembic.ini +4 -0
  69. sky/skylet/services.py +5 -5
  70. sky/skypilot_config.py +87 -75
  71. sky/ssh_node_pools/server.py +4 -4
  72. sky/users/permission.py +4 -0
  73. sky/utils/asyncio_utils.py +63 -3
  74. sky/utils/db/db_utils.py +11 -3
  75. sky/utils/db/migration_utils.py +7 -3
  76. sky/volumes/server/server.py +3 -3
  77. sky/workspaces/server.py +6 -6
  78. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/METADATA +37 -37
  79. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/RECORD +87 -86
  80. sky/dashboard/out/_next/static/8e35zdobdd0bK_Nkba03m/_buildManifest.js +0 -1
  81. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  82. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  83. sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  87. sky/dashboard/out/_next/static/chunks/webpack-3c431f6c9086e487.js +0 -1
  88. /sky/dashboard/out/_next/static/{8e35zdobdd0bK_Nkba03m → IgACOQPupLbX9z-RYVEDx}/_ssgManifest.js +0 -0
  89. /sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-df9f87fcb7f24292.js} +0 -0
  90. /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
  91. /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
  92. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/WHEEL +0 -0
  93. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/entry_points.txt +0 -0
  94. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/licenses/LICENSE +0 -0
  95. {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/top_level.txt +0 -0
sky/jobs/server/server.py CHANGED
@@ -35,7 +35,7 @@ async def launch(request: fastapi.Request,
35
35
  consolidation_mode = managed_jobs_utils.is_consolidation_mode()
36
36
  schedule_type = (api_requests.ScheduleType.SHORT
37
37
  if consolidation_mode else api_requests.ScheduleType.LONG)
38
- executor.schedule_request(
38
+ await executor.schedule_request_async(
39
39
  request_id=request.state.request_id,
40
40
  request_name='jobs.launch',
41
41
  request_body=jobs_launch_body,
@@ -50,7 +50,7 @@ async def launch(request: fastapi.Request,
50
50
  @router.post('/queue')
51
51
  async def queue(request: fastapi.Request,
52
52
  jobs_queue_body: payloads.JobsQueueBody) -> None:
53
- executor.schedule_request(
53
+ await executor.schedule_request_async(
54
54
  request_id=request.state.request_id,
55
55
  request_name='jobs.queue',
56
56
  request_body=jobs_queue_body,
@@ -64,7 +64,7 @@ async def queue(request: fastapi.Request,
64
64
  @router.post('/queue/v2')
65
65
  async def queue_v2(request: fastapi.Request,
66
66
  jobs_queue_body_v2: payloads.JobsQueueV2Body) -> None:
67
- executor.schedule_request(
67
+ await executor.schedule_request_async(
68
68
  request_id=request.state.request_id,
69
69
  request_name='jobs.queue_v2',
70
70
  request_body=jobs_queue_body_v2,
@@ -79,7 +79,7 @@ async def queue_v2(request: fastapi.Request,
79
79
  @router.post('/cancel')
80
80
  async def cancel(request: fastapi.Request,
81
81
  jobs_cancel_body: payloads.JobsCancelBody) -> None:
82
- executor.schedule_request(
82
+ await executor.schedule_request_async(
83
83
  request_id=request.state.request_id,
84
84
  request_name='jobs.cancel',
85
85
  request_body=jobs_cancel_body,
@@ -101,7 +101,7 @@ async def logs(
101
101
  schedule_type = api_requests.ScheduleType.LONG
102
102
  if schedule_type == api_requests.ScheduleType.SHORT:
103
103
  executor.check_request_thread_executor_available()
104
- request_task = executor.prepare_request(
104
+ request_task = await executor.prepare_request_async(
105
105
  request_id=request.state.request_id,
106
106
  request_name='jobs.logs',
107
107
  request_body=jobs_logs_body,
@@ -109,6 +109,7 @@ async def logs(
109
109
  schedule_type=schedule_type,
110
110
  request_cluster_name=common.JOB_CONTROLLER_NAME,
111
111
  )
112
+ kill_request_on_disconnect = False
112
113
  if schedule_type == api_requests.ScheduleType.SHORT:
113
114
  # For short request, run in the coroutine to avoid blocking
114
115
  # short workers.
@@ -117,11 +118,15 @@ async def logs(
117
118
  background_tasks.add_task(task.cancel)
118
119
  else:
119
120
  executor.schedule_prepared_request(request_task)
121
+ # When runs in long executor process, we should kill the request on
122
+ # disconnect to cancel the running routine.
123
+ kill_request_on_disconnect = True
120
124
 
121
125
  return stream_utils.stream_response_for_long_request(
122
126
  request_id=request_task.request_id,
123
127
  logs_path=request_task.log_path,
124
128
  background_tasks=background_tasks,
129
+ kill_request_on_disconnect=kill_request_on_disconnect,
125
130
  )
126
131
 
127
132
 
@@ -136,7 +141,7 @@ async def download_logs(
136
141
  # We should reuse the original request body, so that the env vars, such as
137
142
  # user hash, are kept the same.
138
143
  jobs_download_logs_body.local_dir = str(logs_dir_on_api_server)
139
- executor.schedule_request(
144
+ await executor.schedule_request_async(
140
145
  request_id=request.state.request_id,
141
146
  request_name='jobs.download_logs',
142
147
  request_body=jobs_download_logs_body,
@@ -150,7 +155,7 @@ async def download_logs(
150
155
  @router.post('/pool_apply')
151
156
  async def pool_apply(request: fastapi.Request,
152
157
  jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
153
- executor.schedule_request(
158
+ await executor.schedule_request_async(
154
159
  request_id=request.state.request_id,
155
160
  request_name='jobs.pool_apply',
156
161
  request_body=jobs_pool_apply_body,
@@ -163,7 +168,7 @@ async def pool_apply(request: fastapi.Request,
163
168
  @router.post('/pool_down')
164
169
  async def pool_down(request: fastapi.Request,
165
170
  jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
166
- executor.schedule_request(
171
+ await executor.schedule_request_async(
167
172
  request_id=request.state.request_id,
168
173
  request_name='jobs.pool_down',
169
174
  request_body=jobs_pool_down_body,
@@ -177,7 +182,7 @@ async def pool_down(request: fastapi.Request,
177
182
  async def pool_status(
178
183
  request: fastapi.Request,
179
184
  jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
180
- executor.schedule_request(
185
+ await executor.schedule_request_async(
181
186
  request_id=request.state.request_id,
182
187
  request_name='jobs.pool_status',
183
188
  request_body=jobs_pool_status_body,
@@ -192,7 +197,7 @@ async def pool_tail_logs(
192
197
  request: fastapi.Request, log_body: payloads.JobsPoolLogsBody,
193
198
  background_tasks: fastapi.BackgroundTasks
194
199
  ) -> fastapi.responses.StreamingResponse:
195
- executor.schedule_request(
200
+ await executor.schedule_request_async(
196
201
  request_id=request.state.request_id,
197
202
  request_name='jobs.pool_logs',
198
203
  request_body=log_body,
@@ -201,12 +206,16 @@ async def pool_tail_logs(
201
206
  request_cluster_name=common.JOB_CONTROLLER_NAME,
202
207
  )
203
208
 
204
- request_task = api_requests.get_request(request.state.request_id)
209
+ request_task = api_requests.get_request(request.state.request_id,
210
+ fields=['request_id'])
205
211
 
206
212
  return stream_utils.stream_response_for_long_request(
207
213
  request_id=request_task.request_id,
214
+ # req.log_path is derived from request_id,
215
+ # so it's ok to just grab the request_id in the above query.
208
216
  logs_path=request_task.log_path,
209
217
  background_tasks=background_tasks,
218
+ kill_request_on_disconnect=True,
210
219
  )
211
220
 
212
221
 
@@ -224,7 +233,7 @@ async def pool_download_logs(
224
233
  # We should reuse the original request body, so that the env vars, such as
225
234
  # user hash, are kept the same.
226
235
  download_logs_body.local_dir = str(logs_dir_on_api_server)
227
- executor.schedule_request(
236
+ await executor.schedule_request_async(
228
237
  request_id=request.state.request_id,
229
238
  request_name='jobs.pool_sync_down_logs',
230
239
  request_body=download_logs_body,
sky/jobs/state.py CHANGED
@@ -10,8 +10,7 @@ import sqlite3
10
10
  import threading
11
11
  import time
12
12
  import typing
13
- from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
14
- Union)
13
+ from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, Union
15
14
  import urllib.parse
16
15
 
17
16
  import colorama
@@ -315,41 +314,42 @@ async def _describe_task_transition_failure(session: sql_async.AsyncSession,
315
314
  # by joining the spot and job_info tables.
316
315
  def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
317
316
  return {
318
- '_job_id': r['job_id'], # from spot table
319
- '_task_name': r['job_name'], # deprecated, from spot table
320
- 'resources': r['resources'],
321
- 'submitted_at': r['submitted_at'],
322
- 'status': r['status'],
323
- 'run_timestamp': r['run_timestamp'],
324
- 'start_at': r['start_at'],
325
- 'end_at': r['end_at'],
326
- 'last_recovered_at': r['last_recovered_at'],
327
- 'recovery_count': r['recovery_count'],
328
- 'job_duration': r['job_duration'],
329
- 'failure_reason': r['failure_reason'],
330
- 'job_id': r[spot_table.c.spot_job_id], # ambiguous, use table.column
331
- 'task_id': r['task_id'],
332
- 'task_name': r['task_name'],
333
- 'specs': r['specs'],
334
- 'local_log_file': r['local_log_file'],
335
- 'metadata': r['metadata'],
317
+ '_job_id': r.get('job_id'), # from spot table
318
+ '_task_name': r.get('job_name'), # deprecated, from spot table
319
+ 'resources': r.get('resources'),
320
+ 'submitted_at': r.get('submitted_at'),
321
+ 'status': r.get('status'),
322
+ 'run_timestamp': r.get('run_timestamp'),
323
+ 'start_at': r.get('start_at'),
324
+ 'end_at': r.get('end_at'),
325
+ 'last_recovered_at': r.get('last_recovered_at'),
326
+ 'recovery_count': r.get('recovery_count'),
327
+ 'job_duration': r.get('job_duration'),
328
+ 'failure_reason': r.get('failure_reason'),
329
+ 'job_id': r.get(spot_table.c.spot_job_id
330
+ ), # ambiguous, use table.column
331
+ 'task_id': r.get('task_id'),
332
+ 'task_name': r.get('task_name'),
333
+ 'specs': r.get('specs'),
334
+ 'local_log_file': r.get('local_log_file'),
335
+ 'metadata': r.get('metadata'),
336
336
  # columns from job_info table (some may be None for legacy jobs)
337
- '_job_info_job_id': r[job_info_table.c.spot_job_id
338
- ], # ambiguous, use table.column
339
- 'job_name': r['name'], # from job_info table
340
- 'schedule_state': r['schedule_state'],
341
- 'controller_pid': r['controller_pid'],
342
- 'dag_yaml_path': r['dag_yaml_path'],
343
- 'env_file_path': r['env_file_path'],
344
- 'user_hash': r['user_hash'],
345
- 'workspace': r['workspace'],
346
- 'priority': r['priority'],
347
- 'entrypoint': r['entrypoint'],
348
- 'original_user_yaml_path': r['original_user_yaml_path'],
349
- 'pool': r['pool'],
350
- 'current_cluster_name': r['current_cluster_name'],
351
- 'job_id_on_pool_cluster': r['job_id_on_pool_cluster'],
352
- 'pool_hash': r['pool_hash'],
337
+ '_job_info_job_id': r.get(job_info_table.c.spot_job_id
338
+ ), # ambiguous, use table.column
339
+ 'job_name': r.get('name'), # from job_info table
340
+ 'schedule_state': r.get('schedule_state'),
341
+ 'controller_pid': r.get('controller_pid'),
342
+ 'dag_yaml_path': r.get('dag_yaml_path'),
343
+ 'env_file_path': r.get('env_file_path'),
344
+ 'user_hash': r.get('user_hash'),
345
+ 'workspace': r.get('workspace'),
346
+ 'priority': r.get('priority'),
347
+ 'entrypoint': r.get('entrypoint'),
348
+ 'original_user_yaml_path': r.get('original_user_yaml_path'),
349
+ 'pool': r.get('pool'),
350
+ 'current_cluster_name': r.get('current_cluster_name'),
351
+ 'job_id_on_pool_cluster': r.get('job_id_on_pool_cluster'),
352
+ 'pool_hash': r.get('pool_hash'),
353
353
  }
354
354
 
355
355
 
@@ -1200,6 +1200,277 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
1200
1200
  return jobs
1201
1201
 
1202
1202
 
1203
+ def _map_response_field_to_db_column(field: str):
1204
+ """Map the response field name to an actual SQLAlchemy ColumnElement.
1205
+
1206
+ This ensures we never pass plain strings to SQLAlchemy 2.0 APIs like
1207
+ Select.with_only_columns().
1208
+ """
1209
+ # Explicit aliases differing from actual DB column names
1210
+ alias_mapping = {
1211
+ '_job_id': spot_table.c.job_id, # spot.job_id
1212
+ '_task_name': spot_table.c.job_name, # deprecated, from spot table
1213
+ 'job_id': spot_table.c.spot_job_id, # public job id -> spot.spot_job_id
1214
+ '_job_info_job_id': job_info_table.c.spot_job_id,
1215
+ 'job_name': job_info_table.c.name, # public job name -> job_info.name
1216
+ }
1217
+ if field in alias_mapping:
1218
+ return alias_mapping[field]
1219
+
1220
+ # Try direct match on the `spot` table columns
1221
+ if field in spot_table.c:
1222
+ return spot_table.c[field]
1223
+
1224
+ # Try direct match on the `job_info` table columns
1225
+ if field in job_info_table.c:
1226
+ return job_info_table.c[field]
1227
+
1228
+ raise ValueError(f'Unknown field: {field}')
1229
+
1230
+
1231
+ @_init_db
1232
+ def get_managed_jobs_total() -> int:
1233
+ """Get the total number of managed jobs."""
1234
+ assert _SQLALCHEMY_ENGINE is not None
1235
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1236
+ result = session.execute(
1237
+ sqlalchemy.select(sqlalchemy.func.count() # pylint: disable=not-callable
1238
+ ).select_from(spot_table)).fetchone()
1239
+ return result[0] if result else 0
1240
+
1241
+
1242
+ @_init_db
1243
+ def get_managed_jobs_highest_priority() -> int:
1244
+ """Get the highest priority of the managed jobs."""
1245
+ assert _SQLALCHEMY_ENGINE is not None
1246
+ query = sqlalchemy.select(sqlalchemy.func.max(
1247
+ job_info_table.c.priority)).where(
1248
+ sqlalchemy.and_(
1249
+ job_info_table.c.schedule_state.in_([
1250
+ ManagedJobScheduleState.LAUNCHING.value,
1251
+ ManagedJobScheduleState.ALIVE_BACKOFF.value,
1252
+ ManagedJobScheduleState.WAITING.value,
1253
+ ManagedJobScheduleState.ALIVE_WAITING.value,
1254
+ ]),
1255
+ job_info_table.c.priority.is_not(None),
1256
+ ))
1257
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1258
+ priority = session.execute(query).fetchone()
1259
+ return priority[0] if priority and priority[
1260
+ 0] is not None else constants.MIN_PRIORITY
1261
+
1262
+
1263
+ def build_managed_jobs_with_filters_no_status_query(
1264
+ fields: Optional[List[str]] = None,
1265
+ job_ids: Optional[List[int]] = None,
1266
+ accessible_workspaces: Optional[List[str]] = None,
1267
+ workspace_match: Optional[str] = None,
1268
+ name_match: Optional[str] = None,
1269
+ pool_match: Optional[str] = None,
1270
+ user_hashes: Optional[List[Optional[str]]] = None,
1271
+ skip_finished: bool = False,
1272
+ count_only: bool = False,
1273
+ status_count: bool = False,
1274
+ ) -> sqlalchemy.Select:
1275
+ """Build a query to get managed jobs from the database with filters."""
1276
+ # Join spot and job_info tables to get the job name for each task.
1277
+ # We use LEFT OUTER JOIN mainly for backward compatibility, as for an
1278
+ # existing controller before #1982, the job_info table may not exist,
1279
+ # and all the managed jobs created before will not present in the
1280
+ # job_info.
1281
+ # Note: we will get the user_hash here, but don't try to call
1282
+ # global_user_state.get_user() on it. This runs on the controller, which may
1283
+ # not have the user info. Prefer to do it on the API server side.
1284
+ if count_only:
1285
+ query = sqlalchemy.select(sqlalchemy.func.count().label('count')) # pylint: disable=not-callable
1286
+ elif status_count:
1287
+ query = sqlalchemy.select(spot_table.c.status,
1288
+ sqlalchemy.func.count().label('count')) # pylint: disable=not-callable
1289
+ else:
1290
+ query = sqlalchemy.select(spot_table, job_info_table)
1291
+ query = query.select_from(
1292
+ spot_table.outerjoin(
1293
+ job_info_table,
1294
+ spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
1295
+ if skip_finished:
1296
+ # Filter out finished jobs at the DB level. If a multi-task job is
1297
+ # partially finished, include all its tasks. We do this by first
1298
+ # selecting job_ids that have at least one non-terminal task, then
1299
+ # restricting the main query to those job_ids.
1300
+ terminal_status_values = [
1301
+ s.value for s in ManagedJobStatus.terminal_statuses()
1302
+ ]
1303
+ non_terminal_job_ids_subquery = (sqlalchemy.select(
1304
+ spot_table.c.spot_job_id).where(
1305
+ sqlalchemy.or_(
1306
+ spot_table.c.status.is_(None),
1307
+ sqlalchemy.not_(
1308
+ spot_table.c.status.in_(terminal_status_values)),
1309
+ )).distinct())
1310
+ query = query.where(
1311
+ spot_table.c.spot_job_id.in_(non_terminal_job_ids_subquery))
1312
+ if not count_only and not status_count and fields:
1313
+ # Resolve requested field names to explicit ColumnElements from
1314
+ # the joined tables.
1315
+ selected_columns = [_map_response_field_to_db_column(f) for f in fields]
1316
+ query = query.with_only_columns(*selected_columns)
1317
+ if job_ids is not None:
1318
+ query = query.where(spot_table.c.spot_job_id.in_(job_ids))
1319
+ if accessible_workspaces is not None:
1320
+ query = query.where(
1321
+ job_info_table.c.workspace.in_(accessible_workspaces))
1322
+ if workspace_match is not None:
1323
+ query = query.where(
1324
+ job_info_table.c.workspace.like(f'%{workspace_match}%'))
1325
+ if name_match is not None:
1326
+ query = query.where(job_info_table.c.name.like(f'%{name_match}%'))
1327
+ if pool_match is not None:
1328
+ query = query.where(job_info_table.c.pool.like(f'%{pool_match}%'))
1329
+ if user_hashes is not None:
1330
+ query = query.where(job_info_table.c.user_hash.in_(user_hashes))
1331
+ return query
1332
+
1333
+
1334
+ def build_managed_jobs_with_filters_query(
1335
+ fields: Optional[List[str]] = None,
1336
+ job_ids: Optional[List[int]] = None,
1337
+ accessible_workspaces: Optional[List[str]] = None,
1338
+ workspace_match: Optional[str] = None,
1339
+ name_match: Optional[str] = None,
1340
+ pool_match: Optional[str] = None,
1341
+ user_hashes: Optional[List[Optional[str]]] = None,
1342
+ statuses: Optional[List[str]] = None,
1343
+ skip_finished: bool = False,
1344
+ count_only: bool = False,
1345
+ ) -> sqlalchemy.Select:
1346
+ """Build a query to get managed jobs from the database with filters."""
1347
+ query = build_managed_jobs_with_filters_no_status_query(
1348
+ fields=fields,
1349
+ job_ids=job_ids,
1350
+ accessible_workspaces=accessible_workspaces,
1351
+ workspace_match=workspace_match,
1352
+ name_match=name_match,
1353
+ pool_match=pool_match,
1354
+ user_hashes=user_hashes,
1355
+ skip_finished=skip_finished,
1356
+ count_only=count_only,
1357
+ )
1358
+ if statuses is not None:
1359
+ query = query.where(spot_table.c.status.in_(statuses))
1360
+ return query
1361
+
1362
+
1363
+ @_init_db
1364
+ def get_status_count_with_filters(
1365
+ fields: Optional[List[str]] = None,
1366
+ job_ids: Optional[List[int]] = None,
1367
+ accessible_workspaces: Optional[List[str]] = None,
1368
+ workspace_match: Optional[str] = None,
1369
+ name_match: Optional[str] = None,
1370
+ pool_match: Optional[str] = None,
1371
+ user_hashes: Optional[List[Optional[str]]] = None,
1372
+ skip_finished: bool = False,
1373
+ ) -> Dict[str, int]:
1374
+ """Get the status count of the managed jobs with filters."""
1375
+ query = build_managed_jobs_with_filters_no_status_query(
1376
+ fields=fields,
1377
+ job_ids=job_ids,
1378
+ accessible_workspaces=accessible_workspaces,
1379
+ workspace_match=workspace_match,
1380
+ name_match=name_match,
1381
+ pool_match=pool_match,
1382
+ user_hashes=user_hashes,
1383
+ skip_finished=skip_finished,
1384
+ status_count=True,
1385
+ )
1386
+ query = query.group_by(spot_table.c.status)
1387
+ results: Dict[str, int] = {}
1388
+ assert _SQLALCHEMY_ENGINE is not None
1389
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1390
+ rows = session.execute(query).fetchall()
1391
+ for status_value, count in rows:
1392
+ # status_value is already a string (enum value)
1393
+ results[str(status_value)] = int(count)
1394
+ return results
1395
+
1396
+
1397
+ @_init_db
1398
+ def get_managed_jobs_with_filters(
1399
+ fields: Optional[List[str]] = None,
1400
+ job_ids: Optional[List[int]] = None,
1401
+ accessible_workspaces: Optional[List[str]] = None,
1402
+ workspace_match: Optional[str] = None,
1403
+ name_match: Optional[str] = None,
1404
+ pool_match: Optional[str] = None,
1405
+ user_hashes: Optional[List[Optional[str]]] = None,
1406
+ statuses: Optional[List[str]] = None,
1407
+ skip_finished: bool = False,
1408
+ page: Optional[int] = None,
1409
+ limit: Optional[int] = None,
1410
+ ) -> Tuple[List[Dict[str, Any]], int]:
1411
+ """Get managed jobs from the database with filters."""
1412
+ assert _SQLALCHEMY_ENGINE is not None
1413
+
1414
+ count_query = build_managed_jobs_with_filters_query(
1415
+ fields=None,
1416
+ job_ids=job_ids,
1417
+ accessible_workspaces=accessible_workspaces,
1418
+ workspace_match=workspace_match,
1419
+ name_match=name_match,
1420
+ pool_match=pool_match,
1421
+ user_hashes=user_hashes,
1422
+ statuses=statuses,
1423
+ skip_finished=skip_finished,
1424
+ count_only=True,
1425
+ )
1426
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1427
+ total = session.execute(count_query).fetchone()[0]
1428
+
1429
+ query = build_managed_jobs_with_filters_query(
1430
+ fields=fields,
1431
+ job_ids=job_ids,
1432
+ accessible_workspaces=accessible_workspaces,
1433
+ workspace_match=workspace_match,
1434
+ name_match=name_match,
1435
+ pool_match=pool_match,
1436
+ user_hashes=user_hashes,
1437
+ statuses=statuses,
1438
+ skip_finished=skip_finished,
1439
+ )
1440
+ query = query.order_by(spot_table.c.spot_job_id.desc(),
1441
+ spot_table.c.task_id.asc())
1442
+ if page is not None and limit is not None:
1443
+ query = query.offset((page - 1) * limit).limit(limit)
1444
+ rows = None
1445
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1446
+ rows = session.execute(query).fetchall()
1447
+ jobs = []
1448
+ for row in rows:
1449
+ job_dict = _get_jobs_dict(row._mapping) # pylint: disable=protected-access
1450
+ job_dict['status'] = ManagedJobStatus(job_dict['status'])
1451
+ if job_dict.get('schedule_state') is not None:
1452
+ job_dict['schedule_state'] = ManagedJobScheduleState(
1453
+ job_dict['schedule_state'])
1454
+ if job_dict.get('job_name') is None:
1455
+ job_dict['job_name'] = job_dict.get('task_name')
1456
+ if job_dict.get('metadata') is not None:
1457
+ job_dict['metadata'] = json.loads(job_dict['metadata'])
1458
+
1459
+ # Add user YAML content for managed jobs.
1460
+ yaml_path = job_dict.get('original_user_yaml_path')
1461
+ if (not fields or 'user_yaml' in fields) and yaml_path:
1462
+ try:
1463
+ with open(yaml_path, 'r', encoding='utf-8') as f:
1464
+ job_dict['user_yaml'] = f.read()
1465
+ except (FileNotFoundError, IOError, OSError):
1466
+ job_dict['user_yaml'] = None
1467
+ else:
1468
+ job_dict['user_yaml'] = None
1469
+
1470
+ jobs.append(job_dict)
1471
+ return jobs, total
1472
+
1473
+
1203
1474
  @_init_db
1204
1475
  def get_task_name(job_id: int, task_id: int) -> str:
1205
1476
  """Get the task name of a job."""
@@ -1278,25 +1549,6 @@ def get_pool_from_job_id(job_id: int) -> Optional[str]:
1278
1549
  return pool[0] if pool else None
1279
1550
 
1280
1551
 
1281
- @_init_db
1282
- def get_pool_and_submit_info_from_job_ids(
1283
- job_ids: Set[int]
1284
- ) -> Dict[int, Tuple[Optional[str], Optional[str], Optional[int]]]:
1285
- """Get the pool, cluster name, and job id on pool from job id"""
1286
- assert _SQLALCHEMY_ENGINE is not None
1287
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
1288
- rows = session.execute(
1289
- sqlalchemy.select(
1290
- job_info_table.c.spot_job_id, job_info_table.c.pool,
1291
- job_info_table.c.current_cluster_name,
1292
- job_info_table.c.job_id_on_pool_cluster).where(
1293
- job_info_table.c.spot_job_id.in_(job_ids))).fetchall()
1294
- return {
1295
- job_id: (pool, cluster_name, job_id_on_pool_cluster)
1296
- for job_id, pool, cluster_name, job_id_on_pool_cluster in rows
1297
- }
1298
-
1299
-
1300
1552
  @_init_db
1301
1553
  def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
1302
1554
  """Set the current cluster name for a job."""