skypilot-nightly 1.0.0.dev20251019__py3-none-any.whl → 1.0.0.dev20251022__py3-none-any.whl
This diff reflects the changes between these publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/backends/backend_utils.py +11 -11
- sky/backends/cloud_vm_ray_backend.py +15 -4
- sky/client/cli/command.py +39 -10
- sky/client/cli/flags.py +4 -2
- sky/client/sdk.py +26 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/IgACOQPupLbX9z-RYVEDx/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
- sky/dashboard/out/_next/static/chunks/2755.9b1e69c921b5a870.js +26 -0
- sky/dashboard/out/_next/static/chunks/3015-d014dc5b9412fade.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.998db87cd52a1238.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.14326e329484b57e.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-919e3c01ab6b2633.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +2 -2
- sky/global_user_state.py +137 -37
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +4 -2
- sky/jobs/server/server.py +21 -12
- sky/jobs/state.py +307 -55
- sky/jobs/utils.py +248 -144
- sky/provision/kubernetes/network.py +9 -6
- sky/provision/provisioner.py +8 -0
- sky/schemas/api/responses.py +2 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/serve/server/server.py +8 -7
- sky/server/common.py +10 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +4 -2
- sky/server/requests/executor.py +30 -28
- sky/server/requests/payloads.py +5 -1
- sky/server/requests/preconditions.py +9 -4
- sky/server/requests/requests.py +130 -53
- sky/server/requests/serializers/encoders.py +3 -3
- sky/server/server.py +91 -58
- sky/server/stream_utils.py +127 -38
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/skylet/services.py +5 -5
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +4 -4
- sky/users/permission.py +4 -0
- sky/utils/asyncio_utils.py +63 -3
- sky/utils/db/db_utils.py +11 -3
- sky/utils/db/migration_utils.py +7 -3
- sky/volumes/server/server.py +3 -3
- sky/workspaces/server.py +6 -6
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/METADATA +37 -37
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/RECORD +87 -86
- sky/dashboard/out/_next/static/8e35zdobdd0bK_Nkba03m/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/webpack-3c431f6c9086e487.js +0 -1
- /sky/dashboard/out/_next/static/{8e35zdobdd0bK_Nkba03m → IgACOQPupLbX9z-RYVEDx}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-df9f87fcb7f24292.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251019.dist-info → skypilot_nightly-1.0.0.dev20251022.dist-info}/top_level.txt +0 -0
sky/jobs/server/server.py
CHANGED
```diff
@@ -35,7 +35,7 @@ async def launch(request: fastapi.Request,
     consolidation_mode = managed_jobs_utils.is_consolidation_mode()
     schedule_type = (api_requests.ScheduleType.SHORT
                      if consolidation_mode else api_requests.ScheduleType.LONG)
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.launch',
         request_body=jobs_launch_body,
@@ -50,7 +50,7 @@ async def launch(request: fastapi.Request,
 @router.post('/queue')
 async def queue(request: fastapi.Request,
                 jobs_queue_body: payloads.JobsQueueBody) -> None:
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.queue',
         request_body=jobs_queue_body,
@@ -64,7 +64,7 @@ async def queue(request: fastapi.Request,
 @router.post('/queue/v2')
 async def queue_v2(request: fastapi.Request,
                    jobs_queue_body_v2: payloads.JobsQueueV2Body) -> None:
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.queue_v2',
         request_body=jobs_queue_body_v2,
@@ -79,7 +79,7 @@ async def queue_v2(request: fastapi.Request,
 @router.post('/cancel')
 async def cancel(request: fastapi.Request,
                  jobs_cancel_body: payloads.JobsCancelBody) -> None:
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.cancel',
         request_body=jobs_cancel_body,
@@ -101,7 +101,7 @@ async def logs(
     schedule_type = api_requests.ScheduleType.LONG
     if schedule_type == api_requests.ScheduleType.SHORT:
         executor.check_request_thread_executor_available()
-    request_task = executor.prepare_request(
+    request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
         request_name='jobs.logs',
         request_body=jobs_logs_body,
@@ -109,6 +109,7 @@ async def logs(
         schedule_type=schedule_type,
         request_cluster_name=common.JOB_CONTROLLER_NAME,
     )
+    kill_request_on_disconnect = False
     if schedule_type == api_requests.ScheduleType.SHORT:
         # For short request, run in the coroutine to avoid blocking
         # short workers.
@@ -117,11 +118,15 @@ async def logs(
         background_tasks.add_task(task.cancel)
     else:
         executor.schedule_prepared_request(request_task)
+        # When runs in long executor process, we should kill the request on
+        # disconnect to cancel the running routine.
+        kill_request_on_disconnect = True

     return stream_utils.stream_response_for_long_request(
         request_id=request_task.request_id,
         logs_path=request_task.log_path,
         background_tasks=background_tasks,
+        kill_request_on_disconnect=kill_request_on_disconnect,
     )


@@ -136,7 +141,7 @@ async def download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     jobs_download_logs_body.local_dir = str(logs_dir_on_api_server)
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.download_logs',
         request_body=jobs_download_logs_body,
@@ -150,7 +155,7 @@ async def download_logs(
 @router.post('/pool_apply')
 async def pool_apply(request: fastapi.Request,
                      jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.pool_apply',
         request_body=jobs_pool_apply_body,
@@ -163,7 +168,7 @@ async def pool_apply(request: fastapi.Request,
 @router.post('/pool_down')
 async def pool_down(request: fastapi.Request,
                     jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.pool_down',
         request_body=jobs_pool_down_body,
@@ -177,7 +182,7 @@ async def pool_down(request: fastapi.Request,
 async def pool_status(
         request: fastapi.Request,
         jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.pool_status',
         request_body=jobs_pool_status_body,
@@ -192,7 +197,7 @@ async def pool_tail_logs(
         request: fastapi.Request, log_body: payloads.JobsPoolLogsBody,
         background_tasks: fastapi.BackgroundTasks
 ) -> fastapi.responses.StreamingResponse:
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.pool_logs',
         request_body=log_body,
@@ -201,12 +206,16 @@ async def pool_tail_logs(
         request_cluster_name=common.JOB_CONTROLLER_NAME,
     )

-    request_task = api_requests.get_request(request.state.request_id)
+    request_task = api_requests.get_request(request.state.request_id,
+                                            fields=['request_id'])

     return stream_utils.stream_response_for_long_request(
         request_id=request_task.request_id,
+        # req.log_path is derived from request_id,
+        # so it's ok to just grab the request_id in the above query.
         logs_path=request_task.log_path,
         background_tasks=background_tasks,
+        kill_request_on_disconnect=True,
     )


@@ -224,7 +233,7 @@ async def pool_download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     download_logs_body.local_dir = str(logs_dir_on_api_server)
-    executor.schedule_request(
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.pool_sync_down_logs',
         request_body=download_logs_body,
```
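Every jobs endpoint in this file now awaits `executor.schedule_request_async(...)` (and, for logs, `executor.prepare_request_async(...)`) instead of calling the scheduler synchronously inline, and the log-streaming paths thread through a new `kill_request_on_disconnect` flag so a request running in a long executor process is cancelled when the client disconnects. Below is a minimal sketch, not SkyPilot code, of the underlying pattern; `Scheduler` and `schedule_async` are hypothetical names:

```python
import asyncio

import fastapi

app = fastapi.FastAPI()


class Scheduler:
    """Hypothetical scheduler exposing sync and async entry points."""

    def schedule(self, name: str) -> None:
        # Synchronous path: any queue or database I/O here runs inline
        # and stalls every other coroutine on the event loop.
        pass

    async def schedule_async(self, name: str) -> None:
        # Async path: yields to the event loop while the request is
        # enqueued, so other handlers keep making progress.
        await asyncio.sleep(0)  # stand-in for async queue/database I/O


scheduler = Scheduler()


@app.post('/launch')
async def launch() -> dict:
    # Mirrors the change above: await the scheduler rather than
    # invoking it synchronously inside an async handler.
    await scheduler.schedule_async('jobs.launch')
    return {'request': 'scheduled'}
```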
sky/jobs/state.py
CHANGED
```diff
@@ -10,8 +10,7 @@ import sqlite3
 import threading
 import time
 import typing
-from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
-                    Union)
+from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, Union
 import urllib.parse

 import colorama
```
```diff
@@ -315,41 +314,42 @@ async def _describe_task_transition_failure(session: sql_async.AsyncSession,
 # by joining the spot and job_info tables.
 def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
     return {
-        '_job_id': r['job_id'],
-        '_task_name': r['job_name'],
-        'resources': r['resources'],
-        'submitted_at': r['submitted_at'],
-        'status': r['status'],
-        'run_timestamp': r['run_timestamp'],
-        'start_at': r['start_at'],
-        'end_at': r['end_at'],
-        'last_recovered_at': r['last_recovered_at'],
-        'recovery_count': r['recovery_count'],
-        'job_duration': r['job_duration'],
-        'failure_reason': r['failure_reason'],
-        'job_id': r['spot_job_id'],
-        'task_id': r['task_id'],
-        'task_name': r['task_name'],
-        'specs': r['specs'],
-        'local_log_file': r['local_log_file'],
-        'metadata': r['metadata'],
+        '_job_id': r.get('job_id'),  # from spot table
+        '_task_name': r.get('job_name'),  # deprecated, from spot table
+        'resources': r.get('resources'),
+        'submitted_at': r.get('submitted_at'),
+        'status': r.get('status'),
+        'run_timestamp': r.get('run_timestamp'),
+        'start_at': r.get('start_at'),
+        'end_at': r.get('end_at'),
+        'last_recovered_at': r.get('last_recovered_at'),
+        'recovery_count': r.get('recovery_count'),
+        'job_duration': r.get('job_duration'),
+        'failure_reason': r.get('failure_reason'),
+        'job_id': r.get(spot_table.c.spot_job_id
+                       ),  # ambiguous, use table.column
+        'task_id': r.get('task_id'),
+        'task_name': r.get('task_name'),
+        'specs': r.get('specs'),
+        'local_log_file': r.get('local_log_file'),
+        'metadata': r.get('metadata'),
         # columns from job_info table (some may be None for legacy jobs)
-        '_job_info_job_id': r['spot_job_id'],
-        'job_name': r['name'],
-        'schedule_state': r['schedule_state'],
-        'controller_pid': r['controller_pid'],
-        'dag_yaml_path': r['dag_yaml_path'],
-        'env_file_path': r['env_file_path'],
-        'user_hash': r['user_hash'],
-        'workspace': r['workspace'],
-        'priority': r['priority'],
-        'entrypoint': r['entrypoint'],
-        'original_user_yaml_path': r['original_user_yaml_path'],
-        'pool': r['pool'],
-        'current_cluster_name': r['current_cluster_name'],
-        'job_id_on_pool_cluster': r['job_id_on_pool_cluster'],
-        'pool_hash': r['pool_hash'],
+        '_job_info_job_id': r.get(job_info_table.c.spot_job_id
+                                 ),  # ambiguous, use table.column
+        'job_name': r.get('name'),  # from job_info table
+        'schedule_state': r.get('schedule_state'),
+        'controller_pid': r.get('controller_pid'),
+        'dag_yaml_path': r.get('dag_yaml_path'),
+        'env_file_path': r.get('env_file_path'),
+        'user_hash': r.get('user_hash'),
+        'workspace': r.get('workspace'),
+        'priority': r.get('priority'),
+        'entrypoint': r.get('entrypoint'),
+        'original_user_yaml_path': r.get('original_user_yaml_path'),
+        'pool': r.get('pool'),
+        'current_cluster_name': r.get('current_cluster_name'),
+        'job_id_on_pool_cluster': r.get('job_id_on_pool_cluster'),
+        'pool_hash': r.get('pool_hash'),
     }
```
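The `_get_jobs_dict` rewrite swaps `r['col']` indexing for `Mapping.get`, which matters once callers can request only a subset of fields: a column that was not selected is absent from the row mapping, so `.get` returns `None` where indexing would raise `KeyError`. A minimal sketch with an illustrative schema (not SkyPilot's actual tables):

```python
import sqlalchemy

engine = sqlalchemy.create_engine('sqlite://')
metadata = sqlalchemy.MetaData()
jobs = sqlalchemy.Table(
    'jobs', metadata,
    sqlalchemy.Column('job_id', sqlalchemy.Integer, primary_key=True),
    sqlalchemy.Column('status', sqlalchemy.String),
)
metadata.create_all(engine)

with engine.begin() as conn:
    conn.execute(jobs.insert().values(job_id=1, status='RUNNING'))
    # Select a single column, as a fields=['job_id'] filter would.
    row = conn.execute(sqlalchemy.select(jobs.c.job_id)).fetchone()
    mapping = row._mapping  # pylint: disable=protected-access
    print(mapping.get('job_id'))  # 1
    print(mapping.get('status'))  # None: column absent, no KeyError
```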
```diff
@@ -1200,6 +1200,277 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
     return jobs


+def _map_response_field_to_db_column(field: str):
+    """Map the response field name to an actual SQLAlchemy ColumnElement.
+
+    This ensures we never pass plain strings to SQLAlchemy 2.0 APIs like
+    Select.with_only_columns().
+    """
+    # Explicit aliases differing from actual DB column names
+    alias_mapping = {
+        '_job_id': spot_table.c.job_id,  # spot.job_id
+        '_task_name': spot_table.c.job_name,  # deprecated, from spot table
+        'job_id': spot_table.c.spot_job_id,  # public job id -> spot.spot_job_id
+        '_job_info_job_id': job_info_table.c.spot_job_id,
+        'job_name': job_info_table.c.name,  # public job name -> job_info.name
+    }
+    if field in alias_mapping:
+        return alias_mapping[field]
+
+    # Try direct match on the `spot` table columns
+    if field in spot_table.c:
+        return spot_table.c[field]
+
+    # Try direct match on the `job_info` table columns
+    if field in job_info_table.c:
+        return job_info_table.c[field]
+
+    raise ValueError(f'Unknown field: {field}')
+
+
+@_init_db
+def get_managed_jobs_total() -> int:
+    """Get the total number of managed jobs."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        result = session.execute(
+            sqlalchemy.select(sqlalchemy.func.count()  # pylint: disable=not-callable
+                             ).select_from(spot_table)).fetchone()
+        return result[0] if result else 0
+
+
+@_init_db
+def get_managed_jobs_highest_priority() -> int:
+    """Get the highest priority of the managed jobs."""
+    assert _SQLALCHEMY_ENGINE is not None
+    query = sqlalchemy.select(sqlalchemy.func.max(
+        job_info_table.c.priority)).where(
+            sqlalchemy.and_(
+                job_info_table.c.schedule_state.in_([
+                    ManagedJobScheduleState.LAUNCHING.value,
+                    ManagedJobScheduleState.ALIVE_BACKOFF.value,
+                    ManagedJobScheduleState.WAITING.value,
+                    ManagedJobScheduleState.ALIVE_WAITING.value,
+                ]),
+                job_info_table.c.priority.is_not(None),
+            ))
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        priority = session.execute(query).fetchone()
+        return priority[0] if priority and priority[
+            0] is not None else constants.MIN_PRIORITY
+
+
+def build_managed_jobs_with_filters_no_status_query(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    skip_finished: bool = False,
+    count_only: bool = False,
+    status_count: bool = False,
+) -> sqlalchemy.Select:
+    """Build a query to get managed jobs from the database with filters."""
+    # Join spot and job_info tables to get the job name for each task.
+    # We use LEFT OUTER JOIN mainly for backward compatibility, as for an
+    # existing controller before #1982, the job_info table may not exist,
+    # and all the managed jobs created before will not present in the
+    # job_info.
+    # Note: we will get the user_hash here, but don't try to call
+    # global_user_state.get_user() on it. This runs on the controller, which may
+    # not have the user info. Prefer to do it on the API server side.
+    if count_only:
+        query = sqlalchemy.select(sqlalchemy.func.count().label('count'))  # pylint: disable=not-callable
+    elif status_count:
+        query = sqlalchemy.select(spot_table.c.status,
+                                  sqlalchemy.func.count().label('count'))  # pylint: disable=not-callable
+    else:
+        query = sqlalchemy.select(spot_table, job_info_table)
+    query = query.select_from(
+        spot_table.outerjoin(
+            job_info_table,
+            spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
+    if skip_finished:
+        # Filter out finished jobs at the DB level. If a multi-task job is
+        # partially finished, include all its tasks. We do this by first
+        # selecting job_ids that have at least one non-terminal task, then
+        # restricting the main query to those job_ids.
+        terminal_status_values = [
+            s.value for s in ManagedJobStatus.terminal_statuses()
+        ]
+        non_terminal_job_ids_subquery = (sqlalchemy.select(
+            spot_table.c.spot_job_id).where(
+                sqlalchemy.or_(
+                    spot_table.c.status.is_(None),
+                    sqlalchemy.not_(
+                        spot_table.c.status.in_(terminal_status_values)),
+                )).distinct())
+        query = query.where(
+            spot_table.c.spot_job_id.in_(non_terminal_job_ids_subquery))
+    if not count_only and not status_count and fields:
+        # Resolve requested field names to explicit ColumnElements from
+        # the joined tables.
+        selected_columns = [_map_response_field_to_db_column(f) for f in fields]
+        query = query.with_only_columns(*selected_columns)
+    if job_ids is not None:
+        query = query.where(spot_table.c.spot_job_id.in_(job_ids))
+    if accessible_workspaces is not None:
+        query = query.where(
+            job_info_table.c.workspace.in_(accessible_workspaces))
+    if workspace_match is not None:
+        query = query.where(
+            job_info_table.c.workspace.like(f'%{workspace_match}%'))
+    if name_match is not None:
+        query = query.where(job_info_table.c.name.like(f'%{name_match}%'))
+    if pool_match is not None:
+        query = query.where(job_info_table.c.pool.like(f'%{pool_match}%'))
+    if user_hashes is not None:
+        query = query.where(job_info_table.c.user_hash.in_(user_hashes))
+    return query
+
+
+def build_managed_jobs_with_filters_query(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
+    skip_finished: bool = False,
+    count_only: bool = False,
+) -> sqlalchemy.Select:
+    """Build a query to get managed jobs from the database with filters."""
+    query = build_managed_jobs_with_filters_no_status_query(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        skip_finished=skip_finished,
+        count_only=count_only,
+    )
+    if statuses is not None:
+        query = query.where(spot_table.c.status.in_(statuses))
+    return query
+
+
+@_init_db
+def get_status_count_with_filters(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    skip_finished: bool = False,
+) -> Dict[str, int]:
+    """Get the status count of the managed jobs with filters."""
+    query = build_managed_jobs_with_filters_no_status_query(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        skip_finished=skip_finished,
+        status_count=True,
+    )
+    query = query.group_by(spot_table.c.status)
+    results: Dict[str, int] = {}
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(query).fetchall()
+        for status_value, count in rows:
+            # status_value is already a string (enum value)
+            results[str(status_value)] = int(count)
+    return results
+
+
+@_init_db
+def get_managed_jobs_with_filters(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
+    skip_finished: bool = False,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+) -> Tuple[List[Dict[str, Any]], int]:
+    """Get managed jobs from the database with filters."""
+    assert _SQLALCHEMY_ENGINE is not None
+
+    count_query = build_managed_jobs_with_filters_query(
+        fields=None,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        statuses=statuses,
+        skip_finished=skip_finished,
+        count_only=True,
+    )
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        total = session.execute(count_query).fetchone()[0]
+
+    query = build_managed_jobs_with_filters_query(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        statuses=statuses,
+        skip_finished=skip_finished,
+    )
+    query = query.order_by(spot_table.c.spot_job_id.desc(),
+                           spot_table.c.task_id.asc())
+    if page is not None and limit is not None:
+        query = query.offset((page - 1) * limit).limit(limit)
+    rows = None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(query).fetchall()
+    jobs = []
+    for row in rows:
+        job_dict = _get_jobs_dict(row._mapping)  # pylint: disable=protected-access
+        job_dict['status'] = ManagedJobStatus(job_dict['status'])
+        if job_dict.get('schedule_state') is not None:
+            job_dict['schedule_state'] = ManagedJobScheduleState(
+                job_dict['schedule_state'])
+        if job_dict.get('job_name') is None:
+            job_dict['job_name'] = job_dict.get('task_name')
+        if job_dict.get('metadata') is not None:
+            job_dict['metadata'] = json.loads(job_dict['metadata'])
+
+        # Add user YAML content for managed jobs.
+        yaml_path = job_dict.get('original_user_yaml_path')
+        if (not fields or 'user_yaml' in fields) and yaml_path:
+            try:
+                with open(yaml_path, 'r', encoding='utf-8') as f:
+                    job_dict['user_yaml'] = f.read()
+            except (FileNotFoundError, IOError, OSError):
+                job_dict['user_yaml'] = None
+        else:
+            job_dict['user_yaml'] = None
+
+        jobs.append(job_dict)
+    return jobs, total
+
+
 @_init_db
 def get_task_name(job_id: int, task_id: int) -> str:
     """Get the task name of a job."""
```
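The additions above give the jobs state layer a filtered, paginated query path plus cheap count helpers. A hedged usage sketch of the new `get_managed_jobs_with_filters` (the signature comes from the diff; the argument values are made up for illustration):

```python
# Illustrative call into the new paginated API; values are made up.
jobs, total = get_managed_jobs_with_filters(
    fields=['job_id', 'job_name', 'status'],
    statuses=['RUNNING'],
    skip_finished=True,
    page=1,
    limit=50,
)
print(f'showing {len(jobs)} of {total} matching tasks')
```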
```diff
@@ -1278,25 +1549,6 @@ def get_pool_from_job_id(job_id: int) -> Optional[str]:
     return pool[0] if pool else None


-@_init_db
-def get_pool_and_submit_info_from_job_ids(
-    job_ids: Set[int]
-) -> Dict[int, Tuple[Optional[str], Optional[str], Optional[int]]]:
-    """Get the pool, cluster name, and job id on pool from job id"""
-    assert _SQLALCHEMY_ENGINE is not None
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        rows = session.execute(
-            sqlalchemy.select(
-                job_info_table.c.spot_job_id, job_info_table.c.pool,
-                job_info_table.c.current_cluster_name,
-                job_info_table.c.job_id_on_pool_cluster).where(
-                    job_info_table.c.spot_job_id.in_(job_ids))).fetchall()
-        return {
-            job_id: (pool, cluster_name, job_id_on_pool_cluster)
-            for job_id, pool, cluster_name, job_id_on_pool_cluster in rows
-        }
-
-
 @_init_db
 def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
     """Set the current cluster name for a job."""
```