skypilot-nightly 1.0.0.dev20251021__py3-none-any.whl → 1.0.0.dev20251023__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +5 -2
- sky/client/cli/command.py +118 -30
- sky/client/cli/table_utils.py +14 -8
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/CJlKj9Z9fXGlQCmH4EpLX/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-165dc0e1553d9822.js +6 -0
- sky/dashboard/out/_next/static/chunks/2755.1ffbda43f960962b.js +26 -0
- sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.27318ad826343ea6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-434b7577d72c879b.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +117 -17
- sky/jobs/client/sdk.py +28 -9
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +7 -3
- sky/jobs/server/server.py +11 -11
- sky/jobs/state.py +307 -55
- sky/jobs/utils.py +281 -166
- sky/schemas/api/responses.py +2 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/serve/server/server.py +7 -7
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +1 -13
- sky/server/requests/executor.py +20 -20
- sky/server/requests/payloads.py +3 -0
- sky/server/requests/requests.py +51 -25
- sky/server/requests/serializers/decoders.py +23 -10
- sky/server/requests/serializers/encoders.py +5 -4
- sky/server/rest.py +35 -1
- sky/server/server.py +34 -34
- sky/setup_files/alembic.ini +4 -0
- sky/skylet/log_lib.py +8 -1
- sky/skylet/services.py +5 -5
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +4 -4
- sky/users/permission.py +4 -0
- sky/utils/db/db_utils.py +32 -3
- sky/utils/db/migration_utils.py +7 -3
- sky/utils/subprocess_utils.py +13 -1
- sky/volumes/server/server.py +3 -3
- sky/workspaces/server.py +6 -6
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/METADATA +36 -35
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/RECORD +84 -83
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-49141c317f3a9020.js +0 -6
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/webpack-66f23594d38c7f16.js +0 -1
- sky/dashboard/out/_next/static/jDc1PlRsl9Cc5FQUMLBu8/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{jDc1PlRsl9Cc5FQUMLBu8 → CJlKj9Z9fXGlQCmH4EpLX}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/top_level.txt +0 -0
sky/jobs/client/sdk_async.py
CHANGED
@@ -1,12 +1,13 @@
 """Async SDK functions for managed jobs."""
 import typing
-from typing import
+from typing import Dict, List, Optional, Tuple, Union
 
 from sky import backends
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.client import sdk_async
 from sky.jobs.client import sdk
+from sky.schemas.api import responses
 from sky.skylet import constants
 from sky.usage import usage_lib
 from sky.utils import common_utils
@@ -50,12 +51,17 @@ async def queue(
     refresh: bool,
     skip_finished: bool = False,
     all_users: bool = False,
+    job_ids: Optional[List[int]] = None,
+    limit: Optional[int] = None,
+    fields: Optional[List[str]] = None,
     stream_logs: Optional[
         sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
-) -> List[
+) -> Union[List[responses.ManagedJobRecord], Tuple[
+        List[responses.ManagedJobRecord], int, Dict[str, int], int]]:
     """Async version of queue() that gets statuses of managed jobs."""
     request_id = await context_utils.to_thread(sdk.queue, refresh,
-                                               skip_finished, all_users
+                                               skip_finished, all_users,
+                                               job_ids, limit, fields)
     if stream_logs is not None:
         return await sdk_async._stream_and_get(request_id, stream_logs)  # pylint: disable=protected-access
     else:
sky/jobs/constants.py
CHANGED
@@ -46,7 +46,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
 # The version of the lib files that jobs/utils use. Whenever there is an API
 # change for the jobs/utils, we need to bump this version and update
 # job.utils.ManagedJobCodeGen to handle the version update.
-MANAGED_JOBS_VERSION =
+MANAGED_JOBS_VERSION = 12
 
 # The command for setting up the jobs dashboard on the controller. It firstly
 # checks if the systemd services are available, and if not (e.g., Kubernetes
sky/jobs/server/core.py
CHANGED
@@ -337,6 +337,7 @@ def launch(
     def _submit_one(
         consolidation_mode_job_id: Optional[int] = None,
         job_rank: Optional[int] = None,
+        num_jobs: Optional[int] = None,
     ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
         rank_suffix = '' if job_rank is None else f'-{job_rank}'
         remote_original_user_yaml_path = (
@@ -359,6 +360,7 @@ def launch(
             for task_ in dag.tasks:
                 if job_rank is not None:
                     task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
+                    task_.update_envs({'SKYPILOT_NUM_JOBS': str(num_jobs)})
 
             dag_utils.dump_chain_dag_to_yaml(dag, f.name)
 
@@ -475,7 +477,7 @@ def launch(
     for job_rank in range(num_jobs):
         job_id = (consolidation_mode_job_ids[job_rank]
                   if consolidation_mode_job_ids is not None else None)
-        jid, handle = _submit_one(job_id, job_rank)
+        jid, handle = _submit_one(job_id, job_rank, num_jobs=num_jobs)
         assert jid is not None, (job_id, handle)
        ids.append(jid)
        all_handle = handle
@@ -663,12 +665,13 @@ def queue_v2_api(
     page: Optional[int] = None,
     limit: Optional[int] = None,
     statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
 ) -> Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]:
     """Gets statuses of managed jobs and parse the
     jobs to responses.ManagedJobRecord."""
     jobs, total, status_counts, total_no_filter = queue_v2(
         refresh, skip_finished, all_users, job_ids, user_match, workspace_match,
-        name_match, pool_match, page, limit, statuses)
+        name_match, pool_match, page, limit, statuses, fields)
     return [responses.ManagedJobRecord(**job) for job in jobs
            ], total, status_counts, total_no_filter
 
@@ -686,6 +689,7 @@ def queue_v2(
     page: Optional[int] = None,
     limit: Optional[int] = None,
     statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
 ) -> Tuple[List[Dict[str, Any]], int, Dict[str, int], int]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets statuses of managed jobs with filtering.
@@ -790,7 +794,7 @@ def queue_v2(
     with metrics_lib.time_it('jobs.queue.generate_code', group='jobs'):
         code = managed_job_utils.ManagedJobCodeGen.get_job_table(
             skip_finished, accessible_workspaces, job_ids, workspace_match,
-            name_match, pool_match, page, limit, user_hashes, statuses)
+            name_match, pool_match, page, limit, user_hashes, statuses, fields)
     with metrics_lib.time_it('jobs.queue.run_on_head', group='jobs'):
         returncode, job_table_payload, stderr = backend.run_on_head(
             handle,
sky/jobs/server/server.py
CHANGED
@@ -35,7 +35,7 @@ async def launch(request: fastapi.Request,
     consolidation_mode = managed_jobs_utils.is_consolidation_mode()
     schedule_type = (api_requests.ScheduleType.SHORT
                      if consolidation_mode else api_requests.ScheduleType.LONG)
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.launch',
         request_body=jobs_launch_body,
@@ -50,7 +50,7 @@ async def launch(request: fastapi.Request,
 @router.post('/queue')
 async def queue(request: fastapi.Request,
                 jobs_queue_body: payloads.JobsQueueBody) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.queue',
         request_body=jobs_queue_body,
@@ -64,7 +64,7 @@ async def queue(request: fastapi.Request,
 @router.post('/queue/v2')
 async def queue_v2(request: fastapi.Request,
                    jobs_queue_body_v2: payloads.JobsQueueV2Body) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.queue_v2',
         request_body=jobs_queue_body_v2,
@@ -79,7 +79,7 @@ async def queue_v2(request: fastapi.Request,
 @router.post('/cancel')
 async def cancel(request: fastapi.Request,
                  jobs_cancel_body: payloads.JobsCancelBody) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.cancel',
         request_body=jobs_cancel_body,
@@ -101,7 +101,7 @@ async def logs(
     schedule_type = api_requests.ScheduleType.LONG
     if schedule_type == api_requests.ScheduleType.SHORT:
         executor.check_request_thread_executor_available()
-        request_task = executor.
+        request_task = await executor.prepare_request_async(
             request_id=request.state.request_id,
             request_name='jobs.logs',
             request_body=jobs_logs_body,
@@ -141,7 +141,7 @@ async def download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     jobs_download_logs_body.local_dir = str(logs_dir_on_api_server)
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.download_logs',
         request_body=jobs_download_logs_body,
@@ -155,7 +155,7 @@ async def download_logs(
 @router.post('/pool_apply')
 async def pool_apply(request: fastapi.Request,
                      jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.pool_apply',
         request_body=jobs_pool_apply_body,
@@ -168,7 +168,7 @@ async def pool_apply(request: fastapi.Request,
 @router.post('/pool_down')
 async def pool_down(request: fastapi.Request,
                     jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.pool_down',
         request_body=jobs_pool_down_body,
@@ -182,7 +182,7 @@ async def pool_down(request: fastapi.Request,
 async def pool_status(
         request: fastapi.Request,
         jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.pool_status',
         request_body=jobs_pool_status_body,
@@ -197,7 +197,7 @@ async def pool_tail_logs(
         request: fastapi.Request, log_body: payloads.JobsPoolLogsBody,
         background_tasks: fastapi.BackgroundTasks
 ) -> fastapi.responses.StreamingResponse:
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.pool_logs',
         request_body=log_body,
@@ -233,7 +233,7 @@ async def pool_download_logs(
     # We should reuse the original request body, so that the env vars, such as
     # user hash, are kept the same.
     download_logs_body.local_dir = str(logs_dir_on_api_server)
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
         request_name='jobs.pool_sync_down_logs',
         request_body=download_logs_body,
sky/jobs/state.py
CHANGED
@@ -10,8 +10,7 @@ import sqlite3
 import threading
 import time
 import typing
-from typing import
-    Union)
+from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, Union
 import urllib.parse
 
 import colorama
@@ -315,41 +314,42 @@ async def _describe_task_transition_failure(session: sql_async.AsyncSession,
 # by joining the spot and job_info tables.
 def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
     return {
-        '_job_id': r
-        '_task_name': r
-        'resources': r
-        'submitted_at': r
-        'status': r
-        'run_timestamp': r
-        'start_at': r
-        'end_at': r
-        'last_recovered_at': r
-        'recovery_count': r
-        'job_duration': r
-        'failure_reason': r
-        'job_id': r
-
-        '
-        '
-        '
-        '
+        '_job_id': r.get('job_id'),  # from spot table
+        '_task_name': r.get('job_name'),  # deprecated, from spot table
+        'resources': r.get('resources'),
+        'submitted_at': r.get('submitted_at'),
+        'status': r.get('status'),
+        'run_timestamp': r.get('run_timestamp'),
+        'start_at': r.get('start_at'),
+        'end_at': r.get('end_at'),
+        'last_recovered_at': r.get('last_recovered_at'),
+        'recovery_count': r.get('recovery_count'),
+        'job_duration': r.get('job_duration'),
+        'failure_reason': r.get('failure_reason'),
+        'job_id': r.get(spot_table.c.spot_job_id
+                       ),  # ambiguous, use table.column
+        'task_id': r.get('task_id'),
+        'task_name': r.get('task_name'),
+        'specs': r.get('specs'),
+        'local_log_file': r.get('local_log_file'),
+        'metadata': r.get('metadata'),
         # columns from job_info table (some may be None for legacy jobs)
-        '_job_info_job_id': r
-
-        'job_name': r
-        'schedule_state': r
-        'controller_pid': r
-        'dag_yaml_path': r
-        'env_file_path': r
-        'user_hash': r
-        'workspace': r
-        'priority': r
-        'entrypoint': r
-        'original_user_yaml_path': r
-        'pool': r
-        'current_cluster_name': r
-        'job_id_on_pool_cluster': r
-        'pool_hash': r
+        '_job_info_job_id': r.get(job_info_table.c.spot_job_id
+                                 ),  # ambiguous, use table.column
+        'job_name': r.get('name'),  # from job_info table
+        'schedule_state': r.get('schedule_state'),
+        'controller_pid': r.get('controller_pid'),
+        'dag_yaml_path': r.get('dag_yaml_path'),
+        'env_file_path': r.get('env_file_path'),
+        'user_hash': r.get('user_hash'),
+        'workspace': r.get('workspace'),
+        'priority': r.get('priority'),
+        'entrypoint': r.get('entrypoint'),
+        'original_user_yaml_path': r.get('original_user_yaml_path'),
+        'pool': r.get('pool'),
+        'current_cluster_name': r.get('current_cluster_name'),
+        'job_id_on_pool_cluster': r.get('job_id_on_pool_cluster'),
+        'pool_hash': r.get('pool_hash'),
     }
 
 
@@ -1200,6 +1200,277 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
     return jobs
 
 
+def _map_response_field_to_db_column(field: str):
+    """Map the response field name to an actual SQLAlchemy ColumnElement.
+
+    This ensures we never pass plain strings to SQLAlchemy 2.0 APIs like
+    Select.with_only_columns().
+    """
+    # Explicit aliases differing from actual DB column names
+    alias_mapping = {
+        '_job_id': spot_table.c.job_id,  # spot.job_id
+        '_task_name': spot_table.c.job_name,  # deprecated, from spot table
+        'job_id': spot_table.c.spot_job_id,  # public job id -> spot.spot_job_id
+        '_job_info_job_id': job_info_table.c.spot_job_id,
+        'job_name': job_info_table.c.name,  # public job name -> job_info.name
+    }
+    if field in alias_mapping:
+        return alias_mapping[field]
+
+    # Try direct match on the `spot` table columns
+    if field in spot_table.c:
+        return spot_table.c[field]
+
+    # Try direct match on the `job_info` table columns
+    if field in job_info_table.c:
+        return job_info_table.c[field]
+
+    raise ValueError(f'Unknown field: {field}')
+
+
+@_init_db
+def get_managed_jobs_total() -> int:
+    """Get the total number of managed jobs."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        result = session.execute(
+            sqlalchemy.select(sqlalchemy.func.count()  # pylint: disable=not-callable
+                             ).select_from(spot_table)).fetchone()
+        return result[0] if result else 0
+
+
+@_init_db
+def get_managed_jobs_highest_priority() -> int:
+    """Get the highest priority of the managed jobs."""
+    assert _SQLALCHEMY_ENGINE is not None
+    query = sqlalchemy.select(sqlalchemy.func.max(
+        job_info_table.c.priority)).where(
+            sqlalchemy.and_(
+                job_info_table.c.schedule_state.in_([
+                    ManagedJobScheduleState.LAUNCHING.value,
+                    ManagedJobScheduleState.ALIVE_BACKOFF.value,
+                    ManagedJobScheduleState.WAITING.value,
+                    ManagedJobScheduleState.ALIVE_WAITING.value,
+                ]),
+                job_info_table.c.priority.is_not(None),
+            ))
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        priority = session.execute(query).fetchone()
+        return priority[0] if priority and priority[
+            0] is not None else constants.MIN_PRIORITY
+
+
+def build_managed_jobs_with_filters_no_status_query(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    skip_finished: bool = False,
+    count_only: bool = False,
+    status_count: bool = False,
+) -> sqlalchemy.Select:
+    """Build a query to get managed jobs from the database with filters."""
+    # Join spot and job_info tables to get the job name for each task.
+    # We use LEFT OUTER JOIN mainly for backward compatibility, as for an
+    # existing controller before #1982, the job_info table may not exist,
+    # and all the managed jobs created before will not present in the
+    # job_info.
+    # Note: we will get the user_hash here, but don't try to call
+    # global_user_state.get_user() on it. This runs on the controller, which may
+    # not have the user info. Prefer to do it on the API server side.
+    if count_only:
+        query = sqlalchemy.select(sqlalchemy.func.count().label('count'))  # pylint: disable=not-callable
+    elif status_count:
+        query = sqlalchemy.select(spot_table.c.status,
+                                  sqlalchemy.func.count().label('count'))  # pylint: disable=not-callable
+    else:
+        query = sqlalchemy.select(spot_table, job_info_table)
+    query = query.select_from(
+        spot_table.outerjoin(
+            job_info_table,
+            spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
+    if skip_finished:
+        # Filter out finished jobs at the DB level. If a multi-task job is
+        # partially finished, include all its tasks. We do this by first
+        # selecting job_ids that have at least one non-terminal task, then
+        # restricting the main query to those job_ids.
+        terminal_status_values = [
+            s.value for s in ManagedJobStatus.terminal_statuses()
+        ]
+        non_terminal_job_ids_subquery = (sqlalchemy.select(
+            spot_table.c.spot_job_id).where(
+                sqlalchemy.or_(
+                    spot_table.c.status.is_(None),
+                    sqlalchemy.not_(
+                        spot_table.c.status.in_(terminal_status_values)),
+                )).distinct())
+        query = query.where(
+            spot_table.c.spot_job_id.in_(non_terminal_job_ids_subquery))
+    if not count_only and not status_count and fields:
+        # Resolve requested field names to explicit ColumnElements from
+        # the joined tables.
+        selected_columns = [_map_response_field_to_db_column(f) for f in fields]
+        query = query.with_only_columns(*selected_columns)
+    if job_ids is not None:
+        query = query.where(spot_table.c.spot_job_id.in_(job_ids))
+    if accessible_workspaces is not None:
+        query = query.where(
+            job_info_table.c.workspace.in_(accessible_workspaces))
+    if workspace_match is not None:
+        query = query.where(
+            job_info_table.c.workspace.like(f'%{workspace_match}%'))
+    if name_match is not None:
+        query = query.where(job_info_table.c.name.like(f'%{name_match}%'))
+    if pool_match is not None:
+        query = query.where(job_info_table.c.pool.like(f'%{pool_match}%'))
+    if user_hashes is not None:
+        query = query.where(job_info_table.c.user_hash.in_(user_hashes))
+    return query
+
+
+def build_managed_jobs_with_filters_query(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
+    skip_finished: bool = False,
+    count_only: bool = False,
+) -> sqlalchemy.Select:
+    """Build a query to get managed jobs from the database with filters."""
+    query = build_managed_jobs_with_filters_no_status_query(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        skip_finished=skip_finished,
+        count_only=count_only,
+    )
+    if statuses is not None:
+        query = query.where(spot_table.c.status.in_(statuses))
+    return query
+
+
+@_init_db
+def get_status_count_with_filters(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    skip_finished: bool = False,
+) -> Dict[str, int]:
+    """Get the status count of the managed jobs with filters."""
+    query = build_managed_jobs_with_filters_no_status_query(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        skip_finished=skip_finished,
+        status_count=True,
+    )
+    query = query.group_by(spot_table.c.status)
+    results: Dict[str, int] = {}
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(query).fetchall()
+        for status_value, count in rows:
+            # status_value is already a string (enum value)
+            results[str(status_value)] = int(count)
+    return results
+
+
+@_init_db
+def get_managed_jobs_with_filters(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
+    skip_finished: bool = False,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+) -> Tuple[List[Dict[str, Any]], int]:
+    """Get managed jobs from the database with filters."""
+    assert _SQLALCHEMY_ENGINE is not None
+
+    count_query = build_managed_jobs_with_filters_query(
+        fields=None,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        statuses=statuses,
+        skip_finished=skip_finished,
+        count_only=True,
+    )
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        total = session.execute(count_query).fetchone()[0]
+
+    query = build_managed_jobs_with_filters_query(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        statuses=statuses,
+        skip_finished=skip_finished,
+    )
+    query = query.order_by(spot_table.c.spot_job_id.desc(),
+                           spot_table.c.task_id.asc())
+    if page is not None and limit is not None:
+        query = query.offset((page - 1) * limit).limit(limit)
+    rows = None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(query).fetchall()
+    jobs = []
+    for row in rows:
+        job_dict = _get_jobs_dict(row._mapping)  # pylint: disable=protected-access
+        job_dict['status'] = ManagedJobStatus(job_dict['status'])
+        if job_dict.get('schedule_state') is not None:
+            job_dict['schedule_state'] = ManagedJobScheduleState(
+                job_dict['schedule_state'])
+        if job_dict.get('job_name') is None:
+            job_dict['job_name'] = job_dict.get('task_name')
+        if job_dict.get('metadata') is not None:
+            job_dict['metadata'] = json.loads(job_dict['metadata'])
+
+        # Add user YAML content for managed jobs.
+        yaml_path = job_dict.get('original_user_yaml_path')
+        if (not fields or 'user_yaml' in fields) and yaml_path:
+            try:
+                with open(yaml_path, 'r', encoding='utf-8') as f:
+                    job_dict['user_yaml'] = f.read()
+            except (FileNotFoundError, IOError, OSError):
+                job_dict['user_yaml'] = None
+        else:
+            job_dict['user_yaml'] = None
+
+        jobs.append(job_dict)
+    return jobs, total
+
+
 @_init_db
 def get_task_name(job_id: int, task_id: int) -> str:
     """Get the task name of a job."""
@@ -1278,25 +1549,6 @@ def get_pool_from_job_id(job_id: int) -> Optional[str]:
     return pool[0] if pool else None
 
 
-@_init_db
-def get_pool_and_submit_info_from_job_ids(
-    job_ids: Set[int]
-) -> Dict[int, Tuple[Optional[str], Optional[str], Optional[int]]]:
-    """Get the pool, cluster name, and job id on pool from job id"""
-    assert _SQLALCHEMY_ENGINE is not None
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        rows = session.execute(
-            sqlalchemy.select(
-                job_info_table.c.spot_job_id, job_info_table.c.pool,
-                job_info_table.c.current_cluster_name,
-                job_info_table.c.job_id_on_pool_cluster).where(
-                    job_info_table.c.spot_job_id.in_(job_ids))).fetchall()
-        return {
-            job_id: (pool, cluster_name, job_id_on_pool_cluster)
-            for job_id, pool, cluster_name, job_id_on_pool_cluster in rows
-        }
-
-
 @_init_db
 def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
     """Set the current cluster name for a job."""
|
|
337
|
-
'_job_info_job_id': r
|
|
338
|
-
|
|
339
|
-
'job_name': r
|
|
340
|
-
'schedule_state': r
|
|
341
|
-
'controller_pid': r
|
|
342
|
-
'dag_yaml_path': r
|
|
343
|
-
'env_file_path': r
|
|
344
|
-
'user_hash': r
|
|
345
|
-
'workspace': r
|
|
346
|
-
'priority': r
|
|
347
|
-
'entrypoint': r
|
|
348
|
-
'original_user_yaml_path': r
|
|
349
|
-
'pool': r
|
|
350
|
-
'current_cluster_name': r
|
|
351
|
-
'job_id_on_pool_cluster': r
|
|
352
|
-
'pool_hash': r
|
|
337
|
+
'_job_info_job_id': r.get(job_info_table.c.spot_job_id
|
|
338
|
+
), # ambiguous, use table.column
|
|
339
|
+
'job_name': r.get('name'), # from job_info table
|
|
340
|
+
'schedule_state': r.get('schedule_state'),
|
|
341
|
+
'controller_pid': r.get('controller_pid'),
|
|
342
|
+
'dag_yaml_path': r.get('dag_yaml_path'),
|
|
343
|
+
'env_file_path': r.get('env_file_path'),
|
|
344
|
+
'user_hash': r.get('user_hash'),
|
|
345
|
+
'workspace': r.get('workspace'),
|
|
346
|
+
'priority': r.get('priority'),
|
|
347
|
+
'entrypoint': r.get('entrypoint'),
|
|
348
|
+
'original_user_yaml_path': r.get('original_user_yaml_path'),
|
|
349
|
+
'pool': r.get('pool'),
|
|
350
|
+
'current_cluster_name': r.get('current_cluster_name'),
|
|
351
|
+
'job_id_on_pool_cluster': r.get('job_id_on_pool_cluster'),
|
|
352
|
+
'pool_hash': r.get('pool_hash'),
|
|
353
353
|
}
|
|
354
354
|
|
|
355
355
|
|
|
@@ -1200,6 +1200,277 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
|
|
|
1200
1200
|
return jobs
|
|
1201
1201
|
|
|
1202
1202
|
|
|
1203
|
+
def _map_response_field_to_db_column(field: str):
|
|
1204
|
+
"""Map the response field name to an actual SQLAlchemy ColumnElement.
|
|
1205
|
+
|
|
1206
|
+
This ensures we never pass plain strings to SQLAlchemy 2.0 APIs like
|
|
1207
|
+
Select.with_only_columns().
|
|
1208
|
+
"""
|
|
1209
|
+
# Explicit aliases differing from actual DB column names
|
|
1210
|
+
alias_mapping = {
|
|
1211
|
+
'_job_id': spot_table.c.job_id, # spot.job_id
|
|
1212
|
+
'_task_name': spot_table.c.job_name, # deprecated, from spot table
|
|
1213
|
+
'job_id': spot_table.c.spot_job_id, # public job id -> spot.spot_job_id
|
|
1214
|
+
'_job_info_job_id': job_info_table.c.spot_job_id,
|
|
1215
|
+
'job_name': job_info_table.c.name, # public job name -> job_info.name
|
|
1216
|
+
}
|
|
1217
|
+
if field in alias_mapping:
|
|
1218
|
+
return alias_mapping[field]
|
|
1219
|
+
|
|
1220
|
+
# Try direct match on the `spot` table columns
|
|
1221
|
+
if field in spot_table.c:
|
|
1222
|
+
return spot_table.c[field]
|
|
1223
|
+
|
|
1224
|
+
# Try direct match on the `job_info` table columns
|
|
1225
|
+
if field in job_info_table.c:
|
|
1226
|
+
return job_info_table.c[field]
|
|
1227
|
+
|
|
1228
|
+
raise ValueError(f'Unknown field: {field}')
|
|
1229
|
+
|
|
1230
|
+
|
|
1231
|
+
@_init_db
|
|
1232
|
+
def get_managed_jobs_total() -> int:
|
|
1233
|
+
"""Get the total number of managed jobs."""
|
|
1234
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1235
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1236
|
+
result = session.execute(
|
|
1237
|
+
sqlalchemy.select(sqlalchemy.func.count() # pylint: disable=not-callable
|
|
1238
|
+
).select_from(spot_table)).fetchone()
|
|
1239
|
+
return result[0] if result else 0
|
|
1240
|
+
|
|
1241
|
+
|
|
1242
|
+
@_init_db
|
|
1243
|
+
def get_managed_jobs_highest_priority() -> int:
|
|
1244
|
+
"""Get the highest priority of the managed jobs."""
|
|
1245
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1246
|
+
query = sqlalchemy.select(sqlalchemy.func.max(
|
|
1247
|
+
job_info_table.c.priority)).where(
|
|
1248
|
+
sqlalchemy.and_(
|
|
1249
|
+
job_info_table.c.schedule_state.in_([
|
|
1250
|
+
ManagedJobScheduleState.LAUNCHING.value,
|
|
1251
|
+
ManagedJobScheduleState.ALIVE_BACKOFF.value,
|
|
1252
|
+
ManagedJobScheduleState.WAITING.value,
|
|
1253
|
+
ManagedJobScheduleState.ALIVE_WAITING.value,
|
|
1254
|
+
]),
|
|
1255
|
+
job_info_table.c.priority.is_not(None),
|
|
1256
|
+
))
|
|
1257
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1258
|
+
priority = session.execute(query).fetchone()
|
|
1259
|
+
return priority[0] if priority and priority[
|
|
1260
|
+
0] is not None else constants.MIN_PRIORITY
|
|
1261
|
+
|
|
1262
|
+
|
|
1263
|
+
def build_managed_jobs_with_filters_no_status_query(
|
|
1264
|
+
fields: Optional[List[str]] = None,
|
|
1265
|
+
job_ids: Optional[List[int]] = None,
|
|
1266
|
+
accessible_workspaces: Optional[List[str]] = None,
|
|
1267
|
+
workspace_match: Optional[str] = None,
|
|
1268
|
+
name_match: Optional[str] = None,
|
|
1269
|
+
pool_match: Optional[str] = None,
|
|
1270
|
+
user_hashes: Optional[List[Optional[str]]] = None,
|
|
1271
|
+
skip_finished: bool = False,
|
|
1272
|
+
count_only: bool = False,
|
|
1273
|
+
status_count: bool = False,
|
|
1274
|
+
) -> sqlalchemy.Select:
|
|
1275
|
+
"""Build a query to get managed jobs from the database with filters."""
|
|
1276
|
+
# Join spot and job_info tables to get the job name for each task.
|
|
1277
|
+
# We use LEFT OUTER JOIN mainly for backward compatibility, as for an
|
|
1278
|
+
# existing controller before #1982, the job_info table may not exist,
|
|
1279
|
+
# and all the managed jobs created before will not present in the
|
|
1280
|
+
# job_info.
|
|
1281
|
+
# Note: we will get the user_hash here, but don't try to call
|
|
1282
|
+
# global_user_state.get_user() on it. This runs on the controller, which may
|
|
1283
|
+
# not have the user info. Prefer to do it on the API server side.
|
|
1284
|
+
if count_only:
|
|
1285
|
+
query = sqlalchemy.select(sqlalchemy.func.count().label('count')) # pylint: disable=not-callable
|
|
1286
|
+
elif status_count:
|
|
1287
|
+
query = sqlalchemy.select(spot_table.c.status,
|
|
1288
|
+
sqlalchemy.func.count().label('count')) # pylint: disable=not-callable
|
|
1289
|
+
else:
|
|
1290
|
+
query = sqlalchemy.select(spot_table, job_info_table)
|
|
1291
|
+
query = query.select_from(
|
|
1292
|
+
spot_table.outerjoin(
|
|
1293
|
+
job_info_table,
|
|
1294
|
+
spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
|
|
1295
|
+
if skip_finished:
|
|
1296
|
+
# Filter out finished jobs at the DB level. If a multi-task job is
|
|
1297
|
+
# partially finished, include all its tasks. We do this by first
|
|
1298
|
+
# selecting job_ids that have at least one non-terminal task, then
|
|
1299
|
+
# restricting the main query to those job_ids.
|
|
1300
|
+
terminal_status_values = [
|
|
1301
|
+
s.value for s in ManagedJobStatus.terminal_statuses()
|
|
1302
|
+
]
|
|
1303
|
+
non_terminal_job_ids_subquery = (sqlalchemy.select(
|
|
1304
|
+
spot_table.c.spot_job_id).where(
|
|
1305
|
+
sqlalchemy.or_(
|
|
1306
|
+
spot_table.c.status.is_(None),
|
|
1307
|
+
sqlalchemy.not_(
|
|
1308
|
+
spot_table.c.status.in_(terminal_status_values)),
|
|
1309
|
+
)).distinct())
|
|
1310
|
+
query = query.where(
|
|
1311
|
+
spot_table.c.spot_job_id.in_(non_terminal_job_ids_subquery))
|
|
1312
|
+
if not count_only and not status_count and fields:
|
|
1313
|
+
# Resolve requested field names to explicit ColumnElements from
|
|
1314
|
+
# the joined tables.
|
|
1315
|
+
selected_columns = [_map_response_field_to_db_column(f) for f in fields]
|
|
1316
|
+
query = query.with_only_columns(*selected_columns)
|
|
1317
|
+
if job_ids is not None:
|
|
1318
|
+
query = query.where(spot_table.c.spot_job_id.in_(job_ids))
|
|
1319
|
+
if accessible_workspaces is not None:
|
|
1320
|
+
query = query.where(
|
|
1321
|
+
job_info_table.c.workspace.in_(accessible_workspaces))
|
|
1322
|
+
if workspace_match is not None:
|
|
1323
|
+
query = query.where(
|
|
1324
|
+
job_info_table.c.workspace.like(f'%{workspace_match}%'))
|
|
1325
|
+
if name_match is not None:
|
|
1326
|
+
query = query.where(job_info_table.c.name.like(f'%{name_match}%'))
|
|
1327
|
+
if pool_match is not None:
|
|
1328
|
+
query = query.where(job_info_table.c.pool.like(f'%{pool_match}%'))
|
|
1329
|
+
if user_hashes is not None:
|
|
1330
|
+
query = query.where(job_info_table.c.user_hash.in_(user_hashes))
|
|
1331
|
+
return query
|
|
1332
|
+
|
|
1333
|
+
|
|
1334
|
+
def build_managed_jobs_with_filters_query(
|
|
1335
|
+
fields: Optional[List[str]] = None,
|
|
1336
|
+
job_ids: Optional[List[int]] = None,
|
|
1337
|
+
accessible_workspaces: Optional[List[str]] = None,
|
|
1338
|
+
workspace_match: Optional[str] = None,
|
|
1339
|
+
name_match: Optional[str] = None,
|
|
1340
|
+
pool_match: Optional[str] = None,
|
|
1341
|
+
user_hashes: Optional[List[Optional[str]]] = None,
|
|
1342
|
+
statuses: Optional[List[str]] = None,
|
|
1343
|
+
skip_finished: bool = False,
|
|
1344
|
+
count_only: bool = False,
|
|
1345
|
+
) -> sqlalchemy.Select:
|
|
1346
|
+
"""Build a query to get managed jobs from the database with filters."""
|
|
1347
|
+
query = build_managed_jobs_with_filters_no_status_query(
|
|
1348
|
+
fields=fields,
|
|
1349
|
+
job_ids=job_ids,
|
|
1350
|
+
accessible_workspaces=accessible_workspaces,
|
|
1351
|
+
workspace_match=workspace_match,
|
|
1352
|
+
name_match=name_match,
|
|
1353
|
+
pool_match=pool_match,
|
|
1354
|
+
user_hashes=user_hashes,
|
|
1355
|
+
skip_finished=skip_finished,
|
|
1356
|
+
count_only=count_only,
|
|
1357
|
+
)
|
|
1358
|
+
if statuses is not None:
|
|
1359
|
+
query = query.where(spot_table.c.status.in_(statuses))
|
|
1360
|
+
return query
|
|
1361
|
+
|
|
1362
|
+
|
|
1363
|
+
@_init_db
|
|
1364
|
+
def get_status_count_with_filters(
|
|
1365
|
+
fields: Optional[List[str]] = None,
|
|
1366
|
+
job_ids: Optional[List[int]] = None,
|
|
1367
|
+
accessible_workspaces: Optional[List[str]] = None,
|
|
1368
|
+
workspace_match: Optional[str] = None,
|
|
1369
|
+
name_match: Optional[str] = None,
|
|
1370
|
+
pool_match: Optional[str] = None,
|
|
1371
|
+
user_hashes: Optional[List[Optional[str]]] = None,
|
|
1372
|
+
skip_finished: bool = False,
|
|
1373
|
+
) -> Dict[str, int]:
|
|
1374
|
+
"""Get the status count of the managed jobs with filters."""
|
|
1375
|
+
query = build_managed_jobs_with_filters_no_status_query(
|
|
1376
|
+
fields=fields,
|
|
1377
|
+
job_ids=job_ids,
|
|
1378
|
+
accessible_workspaces=accessible_workspaces,
|
|
1379
|
+
workspace_match=workspace_match,
|
|
1380
|
+
name_match=name_match,
|
|
1381
|
+
pool_match=pool_match,
|
|
1382
|
+
user_hashes=user_hashes,
|
|
1383
|
+
skip_finished=skip_finished,
|
|
1384
|
+
status_count=True,
|
|
1385
|
+
)
|
|
1386
|
+
query = query.group_by(spot_table.c.status)
|
|
1387
|
+
results: Dict[str, int] = {}
|
|
1388
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1389
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1390
|
+
rows = session.execute(query).fetchall()
|
|
1391
|
+
for status_value, count in rows:
|
|
1392
|
+
# status_value is already a string (enum value)
|
|
1393
|
+
results[str(status_value)] = int(count)
|
|
1394
|
+
return results
|
|
1395
|
+
|
|
1396
|
+
|
|
1397
|
+
@_init_db
|
|
1398
|
+
def get_managed_jobs_with_filters(
|
|
1399
|
+
fields: Optional[List[str]] = None,
|
|
1400
|
+
job_ids: Optional[List[int]] = None,
|
|
1401
|
+
accessible_workspaces: Optional[List[str]] = None,
|
|
1402
|
+
workspace_match: Optional[str] = None,
|
|
1403
|
+
name_match: Optional[str] = None,
|
|
1404
|
+
pool_match: Optional[str] = None,
|
|
1405
|
+
user_hashes: Optional[List[Optional[str]]] = None,
|
|
1406
|
+
statuses: Optional[List[str]] = None,
|
|
1407
|
+
skip_finished: bool = False,
|
|
1408
|
+
page: Optional[int] = None,
|
|
1409
|
+
limit: Optional[int] = None,
|
|
1410
|
+
) -> Tuple[List[Dict[str, Any]], int]:
|
|
1411
|
+
"""Get managed jobs from the database with filters."""
|
|
1412
|
+
assert _SQLALCHEMY_ENGINE is not None
|
|
1413
|
+
|
|
1414
|
+
count_query = build_managed_jobs_with_filters_query(
|
|
1415
|
+
fields=None,
|
|
1416
|
+
job_ids=job_ids,
|
|
1417
|
+
accessible_workspaces=accessible_workspaces,
|
|
1418
|
+
workspace_match=workspace_match,
|
|
1419
|
+
name_match=name_match,
|
|
1420
|
+
pool_match=pool_match,
|
|
1421
|
+
user_hashes=user_hashes,
|
|
1422
|
+
statuses=statuses,
|
|
1423
|
+
skip_finished=skip_finished,
|
|
1424
|
+
count_only=True,
|
|
1425
|
+
)
|
|
1426
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1427
|
+
total = session.execute(count_query).fetchone()[0]
|
|
1428
|
+
|
|
1429
|
+
query = build_managed_jobs_with_filters_query(
|
|
1430
|
+
fields=fields,
|
|
1431
|
+
job_ids=job_ids,
|
|
1432
|
+
accessible_workspaces=accessible_workspaces,
|
|
1433
|
+
workspace_match=workspace_match,
|
|
1434
|
+
name_match=name_match,
|
|
1435
|
+
pool_match=pool_match,
|
|
1436
|
+
user_hashes=user_hashes,
|
|
1437
|
+
statuses=statuses,
|
|
1438
|
+
skip_finished=skip_finished,
|
|
1439
|
+
)
|
|
1440
|
+
query = query.order_by(spot_table.c.spot_job_id.desc(),
|
|
1441
|
+
spot_table.c.task_id.asc())
|
|
1442
|
+
if page is not None and limit is not None:
|
|
1443
|
+
query = query.offset((page - 1) * limit).limit(limit)
|
|
1444
|
+
rows = None
|
|
1445
|
+
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1446
|
+
rows = session.execute(query).fetchall()
|
|
1447
|
+
jobs = []
|
|
1448
|
+
for row in rows:
|
|
1449
|
+
job_dict = _get_jobs_dict(row._mapping) # pylint: disable=protected-access
|
|
1450
|
+
job_dict['status'] = ManagedJobStatus(job_dict['status'])
|
|
1451
|
+
if job_dict.get('schedule_state') is not None:
|
|
1452
|
+
job_dict['schedule_state'] = ManagedJobScheduleState(
|
|
1453
|
+
job_dict['schedule_state'])
|
|
1454
|
+
if job_dict.get('job_name') is None:
|
|
1455
|
+
job_dict['job_name'] = job_dict.get('task_name')
|
|
1456
|
+
if job_dict.get('metadata') is not None:
|
|
1457
|
+
job_dict['metadata'] = json.loads(job_dict['metadata'])
|
|
1458
|
+
|
|
1459
|
+
# Add user YAML content for managed jobs.
|
|
1460
|
+
yaml_path = job_dict.get('original_user_yaml_path')
|
|
1461
|
+
if (not fields or 'user_yaml' in fields) and yaml_path:
|
|
1462
|
+
try:
|
|
1463
|
+
with open(yaml_path, 'r', encoding='utf-8') as f:
|
|
1464
|
+
job_dict['user_yaml'] = f.read()
|
|
1465
|
+
except (FileNotFoundError, IOError, OSError):
|
|
1466
|
+
job_dict['user_yaml'] = None
|
|
1467
|
+
else:
|
|
1468
|
+
job_dict['user_yaml'] = None
|
|
1469
|
+
|
|
1470
|
+
jobs.append(job_dict)
|
|
1471
|
+
return jobs, total
|
|
1472
|
+
|
|
1473
|
+
|
|
1203
1474
|
@_init_db
|
|
1204
1475
|
def get_task_name(job_id: int, task_id: int) -> str:
|
|
1205
1476
|
"""Get the task name of a job."""
|
|
@@ -1278,25 +1549,6 @@ def get_pool_from_job_id(job_id: int) -> Optional[str]:
|
|
|
1278
1549
|
return pool[0] if pool else None
|
|
1279
1550
|
|
|
1280
1551
|
|
|
1281
|
-
@_init_db
|
|
1282
|
-
def get_pool_and_submit_info_from_job_ids(
|
|
1283
|
-
job_ids: Set[int]
|
|
1284
|
-
) -> Dict[int, Tuple[Optional[str], Optional[str], Optional[int]]]:
|
|
1285
|
-
"""Get the pool, cluster name, and job id on pool from job id"""
|
|
1286
|
-
assert _SQLALCHEMY_ENGINE is not None
|
|
1287
|
-
with orm.Session(_SQLALCHEMY_ENGINE) as session:
|
|
1288
|
-
rows = session.execute(
|
|
1289
|
-
sqlalchemy.select(
|
|
1290
|
-
job_info_table.c.spot_job_id, job_info_table.c.pool,
|
|
1291
|
-
job_info_table.c.current_cluster_name,
|
|
1292
|
-
job_info_table.c.job_id_on_pool_cluster).where(
|
|
1293
|
-
job_info_table.c.spot_job_id.in_(job_ids))).fetchall()
|
|
1294
|
-
return {
|
|
1295
|
-
job_id: (pool, cluster_name, job_id_on_pool_cluster)
|
|
1296
|
-
for job_id, pool, cluster_name, job_id_on_pool_cluster in rows
|
|
1297
|
-
}
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
1552
|
@_init_db
|
|
1301
1553
|
def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
|
|
1302
1554
|
"""Set the current cluster name for a job."""
|