skypilot-nightly 1.0.0.dev20251021__py3-none-any.whl → 1.0.0.dev20251023__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of skypilot-nightly has been flagged as a potentially problematic release.
Files changed (93)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +5 -2
  3. sky/client/cli/command.py +118 -30
  4. sky/client/cli/table_utils.py +14 -8
  5. sky/dashboard/out/404.html +1 -1
  6. sky/dashboard/out/_next/static/CJlKj9Z9fXGlQCmH4EpLX/_buildManifest.js +1 -0
  7. sky/dashboard/out/_next/static/chunks/1141-ec6f902ffb865853.js +11 -0
  8. sky/dashboard/out/_next/static/chunks/1871-165dc0e1553d9822.js +6 -0
  9. sky/dashboard/out/_next/static/chunks/2755.1ffbda43f960962b.js +26 -0
  10. sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/{3294.1fafbf42b3bcebff.js → 3294.27318ad826343ea6.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.483a3dda2d52f26e.js} +1 -1
  13. sky/dashboard/out/_next/static/chunks/{1121-d0782b9251f0fcd3.js → 4282-d2f3ef2fbf78e347.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/6856-5c94d394259cdb6e.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +31 -0
  18. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-8f058b0346db2aff.js → [job]-602eeead010ec1d6.js} +1 -1
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-18b334dedbd9f6f2.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-57221ec2e4e01076.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-44ce535a0a0ad4ec.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-872e6a00165534f4.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-0dc34cf9a8710a9f.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-3a543725492fb896.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-d2af9d22e87cc4ba.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-9ad108cd67d16d96.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-6fc994fa1ee6c6bf.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/webpack-434b7577d72c879b.js +1 -0
  29. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  30. sky/dashboard/out/clusters/[cluster].html +1 -1
  31. sky/dashboard/out/clusters.html +1 -1
  32. sky/dashboard/out/config.html +1 -1
  33. sky/dashboard/out/index.html +1 -1
  34. sky/dashboard/out/infra/[context].html +1 -1
  35. sky/dashboard/out/infra.html +1 -1
  36. sky/dashboard/out/jobs/[job].html +1 -1
  37. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  38. sky/dashboard/out/jobs.html +1 -1
  39. sky/dashboard/out/users.html +1 -1
  40. sky/dashboard/out/volumes.html +1 -1
  41. sky/dashboard/out/workspace/new.html +1 -1
  42. sky/dashboard/out/workspaces/[name].html +1 -1
  43. sky/dashboard/out/workspaces.html +1 -1
  44. sky/global_user_state.py +117 -17
  45. sky/jobs/client/sdk.py +28 -9
  46. sky/jobs/client/sdk_async.py +9 -3
  47. sky/jobs/constants.py +1 -1
  48. sky/jobs/server/core.py +7 -3
  49. sky/jobs/server/server.py +11 -11
  50. sky/jobs/state.py +307 -55
  51. sky/jobs/utils.py +281 -166
  52. sky/schemas/api/responses.py +2 -0
  53. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  54. sky/serve/server/server.py +7 -7
  55. sky/server/auth/oauth2_proxy.py +2 -5
  56. sky/server/common.py +1 -13
  57. sky/server/requests/executor.py +20 -20
  58. sky/server/requests/payloads.py +3 -0
  59. sky/server/requests/requests.py +51 -25
  60. sky/server/requests/serializers/decoders.py +23 -10
  61. sky/server/requests/serializers/encoders.py +5 -4
  62. sky/server/rest.py +35 -1
  63. sky/server/server.py +34 -34
  64. sky/setup_files/alembic.ini +4 -0
  65. sky/skylet/log_lib.py +8 -1
  66. sky/skylet/services.py +5 -5
  67. sky/skylet/subprocess_daemon.py +103 -29
  68. sky/skypilot_config.py +87 -75
  69. sky/ssh_node_pools/server.py +4 -4
  70. sky/users/permission.py +4 -0
  71. sky/utils/db/db_utils.py +32 -3
  72. sky/utils/db/migration_utils.py +7 -3
  73. sky/utils/subprocess_utils.py +13 -1
  74. sky/volumes/server/server.py +3 -3
  75. sky/workspaces/server.py +6 -6
  76. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/METADATA +36 -35
  77. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/RECORD +84 -83
  78. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  79. sky/dashboard/out/_next/static/chunks/1871-49141c317f3a9020.js +0 -6
  80. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  81. sky/dashboard/out/_next/static/chunks/3015-7e0e8f06bb2f881c.js +0 -1
  82. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  83. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  84. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  85. sky/dashboard/out/_next/static/chunks/webpack-66f23594d38c7f16.js +0 -1
  86. sky/dashboard/out/_next/static/jDc1PlRsl9Cc5FQUMLBu8/_buildManifest.js +0 -1
  87. /sky/dashboard/out/_next/static/{jDc1PlRsl9Cc5FQUMLBu8 → CJlKj9Z9fXGlQCmH4EpLX}/_ssgManifest.js +0 -0
  88. /sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-e5c9ce6a24fc0de4.js → [job]-8677af16befde039.js} +0 -0
  89. /sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-e020fd69dbe76cea.js} +0 -0
  90. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/WHEEL +0 -0
  91. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/entry_points.txt +0 -0
  92. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/licenses/LICENSE +0 -0
  93. {skypilot_nightly-1.0.0.dev20251021.dist-info → skypilot_nightly-1.0.0.dev20251023.dist-info}/top_level.txt +0 -0
sky/jobs/client/sdk_async.py CHANGED
@@ -1,12 +1,13 @@
  """Async SDK functions for managed jobs."""
  import typing
- from typing import Any, Dict, List, Optional, Tuple, Union
+ from typing import Dict, List, Optional, Tuple, Union

  from sky import backends
  from sky import sky_logging
  from sky.adaptors import common as adaptors_common
  from sky.client import sdk_async
  from sky.jobs.client import sdk
+ from sky.schemas.api import responses
  from sky.skylet import constants
  from sky.usage import usage_lib
  from sky.utils import common_utils
@@ -50,12 +51,17 @@ async def queue(
  refresh: bool,
  skip_finished: bool = False,
  all_users: bool = False,
+ job_ids: Optional[List[int]] = None,
+ limit: Optional[int] = None,
+ fields: Optional[List[str]] = None,
  stream_logs: Optional[
  sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
- ) -> List[Dict[str, Any]]:
+ ) -> Union[List[responses.ManagedJobRecord], Tuple[
+ List[responses.ManagedJobRecord], int, Dict[str, int], int]]:
  """Async version of queue() that gets statuses of managed jobs."""
  request_id = await context_utils.to_thread(sdk.queue, refresh,
- skip_finished, all_users)
+ skip_finished, all_users,
+ job_ids, limit, fields)
  if stream_logs is not None:
  return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
  else:
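For illustration, a minimal sketch of calling the extended async queue() with the new job_ids/limit/fields filters (this assumes the module is importable as sky.jobs.client.sdk_async and that an API server is reachable; the return shape follows the Union annotation shown above):

    # Hedged usage sketch of the new queue() parameters shown in this hunk.
    import asyncio

    from sky.jobs.client import sdk_async as jobs_sdk_async

    async def main() -> None:
        # Only fetch two jobs and only the columns the caller needs.
        records = await jobs_sdk_async.queue(
            refresh=False,
            skip_finished=True,
            all_users=False,
            job_ids=[1, 2],                           # new in this release
            limit=10,                                 # new in this release
            fields=['job_id', 'status', 'job_name'],  # new in this release
        )
        print(records)

    asyncio.run(main())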
sky/jobs/constants.py CHANGED
@@ -46,7 +46,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
  # The version of the lib files that jobs/utils use. Whenever there is an API
  # change for the jobs/utils, we need to bump this version and update
  # job.utils.ManagedJobCodeGen to handle the version update.
- MANAGED_JOBS_VERSION = 11
+ MANAGED_JOBS_VERSION = 12

  # The command for setting up the jobs dashboard on the controller. It firstly
  # checks if the systemd services are available, and if not (e.g., Kubernetes
sky/jobs/server/core.py CHANGED
@@ -337,6 +337,7 @@ def launch(
  def _submit_one(
  consolidation_mode_job_id: Optional[int] = None,
  job_rank: Optional[int] = None,
+ num_jobs: Optional[int] = None,
  ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
  rank_suffix = '' if job_rank is None else f'-{job_rank}'
  remote_original_user_yaml_path = (
@@ -359,6 +360,7 @@ def launch(
  for task_ in dag.tasks:
  if job_rank is not None:
  task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
+ task_.update_envs({'SKYPILOT_NUM_JOBS': str(num_jobs)})

  dag_utils.dump_chain_dag_to_yaml(dag, f.name)

@@ -475,7 +477,7 @@ def launch(
  for job_rank in range(num_jobs):
  job_id = (consolidation_mode_job_ids[job_rank]
  if consolidation_mode_job_ids is not None else None)
- jid, handle = _submit_one(job_id, job_rank)
+ jid, handle = _submit_one(job_id, job_rank, num_jobs=num_jobs)
  assert jid is not None, (job_id, handle)
  ids.append(jid)
  all_handle = handle
@@ -663,12 +665,13 @@ def queue_v2_api(
  page: Optional[int] = None,
  limit: Optional[int] = None,
  statuses: Optional[List[str]] = None,
+ fields: Optional[List[str]] = None,
  ) -> Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]:
  """Gets statuses of managed jobs and parse the
  jobs to responses.ManagedJobRecord."""
  jobs, total, status_counts, total_no_filter = queue_v2(
  refresh, skip_finished, all_users, job_ids, user_match, workspace_match,
- name_match, pool_match, page, limit, statuses)
+ name_match, pool_match, page, limit, statuses, fields)
  return [responses.ManagedJobRecord(**job) for job in jobs
  ], total, status_counts, total_no_filter

@@ -686,6 +689,7 @@ def queue_v2(
  page: Optional[int] = None,
  limit: Optional[int] = None,
  statuses: Optional[List[str]] = None,
+ fields: Optional[List[str]] = None,
  ) -> Tuple[List[Dict[str, Any]], int, Dict[str, int], int]:
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
  """Gets statuses of managed jobs with filtering.
@@ -790,7 +794,7 @@ def queue_v2(
  with metrics_lib.time_it('jobs.queue.generate_code', group='jobs'):
  code = managed_job_utils.ManagedJobCodeGen.get_job_table(
  skip_finished, accessible_workspaces, job_ids, workspace_match,
- name_match, pool_match, page, limit, user_hashes, statuses)
+ name_match, pool_match, page, limit, user_hashes, statuses, fields)
  with metrics_lib.time_it('jobs.queue.run_on_head', group='jobs'):
  returncode, job_table_payload, stderr = backend.run_on_head(
  handle,
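The second hunk above exports SKYPILOT_NUM_JOBS to each task, alongside the existing SKYPILOT_JOB_RANK. A small illustrative sketch of how a task's own program could consume these two variables to shard work (the sharding logic itself is an example, not SkyPilot behavior):

    # Illustrative only: read the env vars that core.py injects per job.
    import os

    rank = int(os.environ.get('SKYPILOT_JOB_RANK', '0'))
    num_jobs = int(os.environ.get('SKYPILOT_NUM_JOBS', '1'))

    # Example sharding: each launched job takes every num_jobs-th item.
    work_items = list(range(100))
    my_items = work_items[rank::num_jobs]
    print(f'job {rank}/{num_jobs} processes {len(my_items)} items')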
sky/jobs/server/server.py CHANGED
@@ -35,7 +35,7 @@ async def launch(request: fastapi.Request,
  consolidation_mode = managed_jobs_utils.is_consolidation_mode()
  schedule_type = (api_requests.ScheduleType.SHORT
  if consolidation_mode else api_requests.ScheduleType.LONG)
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
  request_name='jobs.launch',
  request_body=jobs_launch_body,
@@ -50,7 +50,7 @@ async def launch(request: fastapi.Request,
  @router.post('/queue')
  async def queue(request: fastapi.Request,
  jobs_queue_body: payloads.JobsQueueBody) -> None:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
  request_name='jobs.queue',
  request_body=jobs_queue_body,
@@ -64,7 +64,7 @@ async def queue(request: fastapi.Request,
  @router.post('/queue/v2')
  async def queue_v2(request: fastapi.Request,
  jobs_queue_body_v2: payloads.JobsQueueV2Body) -> None:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
  request_name='jobs.queue_v2',
  request_body=jobs_queue_body_v2,
@@ -79,7 +79,7 @@ async def queue_v2(request: fastapi.Request,
  @router.post('/cancel')
  async def cancel(request: fastapi.Request,
  jobs_cancel_body: payloads.JobsCancelBody) -> None:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
  request_name='jobs.cancel',
  request_body=jobs_cancel_body,
@@ -101,7 +101,7 @@ async def logs(
  schedule_type = api_requests.ScheduleType.LONG
  if schedule_type == api_requests.ScheduleType.SHORT:
  executor.check_request_thread_executor_available()
- request_task = executor.prepare_request(
+ request_task = await executor.prepare_request_async(
  request_id=request.state.request_id,
  request_name='jobs.logs',
  request_body=jobs_logs_body,
@@ -141,7 +141,7 @@ async def download_logs(
  # We should reuse the original request body, so that the env vars, such as
  # user hash, are kept the same.
  jobs_download_logs_body.local_dir = str(logs_dir_on_api_server)
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
  request_name='jobs.download_logs',
  request_body=jobs_download_logs_body,
@@ -155,7 +155,7 @@ async def download_logs(
  @router.post('/pool_apply')
  async def pool_apply(request: fastapi.Request,
  jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
  request_name='jobs.pool_apply',
  request_body=jobs_pool_apply_body,
@@ -168,7 +168,7 @@ async def pool_apply(request: fastapi.Request,
  @router.post('/pool_down')
  async def pool_down(request: fastapi.Request,
  jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
  request_name='jobs.pool_down',
  request_body=jobs_pool_down_body,
@@ -182,7 +182,7 @@ async def pool_down(request: fastapi.Request,
  async def pool_status(
  request: fastapi.Request,
  jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
  request_name='jobs.pool_status',
  request_body=jobs_pool_status_body,
@@ -197,7 +197,7 @@ async def pool_tail_logs(
  request: fastapi.Request, log_body: payloads.JobsPoolLogsBody,
  background_tasks: fastapi.BackgroundTasks
  ) -> fastapi.responses.StreamingResponse:
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
  request_name='jobs.pool_logs',
  request_body=log_body,
@@ -233,7 +233,7 @@ async def pool_download_logs(
  # We should reuse the original request body, so that the env vars, such as
  # user hash, are kept the same.
  download_logs_body.local_dir = str(logs_dir_on_api_server)
- executor.schedule_request(
+ await executor.schedule_request_async(
  request_id=request.state.request_id,
  request_name='jobs.pool_sync_down_logs',
  request_body=download_logs_body,
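Every handler above now awaits executor.schedule_request_async() (and prepare_request_async()) instead of calling the synchronous variants. The implementation of those async variants is not part of this diff; a purely hypothetical sketch of the general pattern, offloading a blocking scheduling call to a worker thread so the FastAPI event loop stays unblocked, could look like:

    # Hypothetical sketch only; not the actual executor implementation.
    import asyncio
    from typing import Any

    def schedule_request(**kwargs: Any) -> None:
        """Stand-in for a blocking scheduling call (DB write, enqueue, ...)."""

    async def schedule_request_async(**kwargs: Any) -> None:
        # Run the blocking call in the default thread pool so the event loop
        # can keep serving other requests while this request is scheduled.
        await asyncio.to_thread(schedule_request, **kwargs)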
sky/jobs/state.py CHANGED
@@ -10,8 +10,7 @@ import sqlite3
  import threading
  import time
  import typing
- from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
- Union)
+ from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, Union
  import urllib.parse

  import colorama
@@ -315,41 +314,42 @@ async def _describe_task_transition_failure(session: sql_async.AsyncSession,
  # by joining the spot and job_info tables.
  def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
  return {
- '_job_id': r['job_id'], # from spot table
- '_task_name': r['job_name'], # deprecated, from spot table
- 'resources': r['resources'],
- 'submitted_at': r['submitted_at'],
- 'status': r['status'],
- 'run_timestamp': r['run_timestamp'],
- 'start_at': r['start_at'],
- 'end_at': r['end_at'],
- 'last_recovered_at': r['last_recovered_at'],
- 'recovery_count': r['recovery_count'],
- 'job_duration': r['job_duration'],
- 'failure_reason': r['failure_reason'],
- 'job_id': r[spot_table.c.spot_job_id], # ambiguous, use table.column
- 'task_id': r['task_id'],
- 'task_name': r['task_name'],
- 'specs': r['specs'],
- 'local_log_file': r['local_log_file'],
- 'metadata': r['metadata'],
+ '_job_id': r.get('job_id'), # from spot table
+ '_task_name': r.get('job_name'), # deprecated, from spot table
+ 'resources': r.get('resources'),
+ 'submitted_at': r.get('submitted_at'),
+ 'status': r.get('status'),
+ 'run_timestamp': r.get('run_timestamp'),
+ 'start_at': r.get('start_at'),
+ 'end_at': r.get('end_at'),
+ 'last_recovered_at': r.get('last_recovered_at'),
+ 'recovery_count': r.get('recovery_count'),
+ 'job_duration': r.get('job_duration'),
+ 'failure_reason': r.get('failure_reason'),
+ 'job_id': r.get(spot_table.c.spot_job_id
+ ), # ambiguous, use table.column
+ 'task_id': r.get('task_id'),
+ 'task_name': r.get('task_name'),
+ 'specs': r.get('specs'),
+ 'local_log_file': r.get('local_log_file'),
+ 'metadata': r.get('metadata'),
  # columns from job_info table (some may be None for legacy jobs)
- '_job_info_job_id': r[job_info_table.c.spot_job_id
- ], # ambiguous, use table.column
- 'job_name': r['name'], # from job_info table
- 'schedule_state': r['schedule_state'],
- 'controller_pid': r['controller_pid'],
- 'dag_yaml_path': r['dag_yaml_path'],
- 'env_file_path': r['env_file_path'],
- 'user_hash': r['user_hash'],
- 'workspace': r['workspace'],
- 'priority': r['priority'],
- 'entrypoint': r['entrypoint'],
- 'original_user_yaml_path': r['original_user_yaml_path'],
- 'pool': r['pool'],
- 'current_cluster_name': r['current_cluster_name'],
- 'job_id_on_pool_cluster': r['job_id_on_pool_cluster'],
- 'pool_hash': r['pool_hash'],
+ '_job_info_job_id': r.get(job_info_table.c.spot_job_id
+ ), # ambiguous, use table.column
+ 'job_name': r.get('name'), # from job_info table
+ 'schedule_state': r.get('schedule_state'),
+ 'controller_pid': r.get('controller_pid'),
+ 'dag_yaml_path': r.get('dag_yaml_path'),
+ 'env_file_path': r.get('env_file_path'),
+ 'user_hash': r.get('user_hash'),
+ 'workspace': r.get('workspace'),
+ 'priority': r.get('priority'),
+ 'entrypoint': r.get('entrypoint'),
+ 'original_user_yaml_path': r.get('original_user_yaml_path'),
+ 'pool': r.get('pool'),
+ 'current_cluster_name': r.get('current_cluster_name'),
+ 'job_id_on_pool_cluster': r.get('job_id_on_pool_cluster'),
+ 'pool_hash': r.get('pool_hash'),
  }
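The hunk above switches _get_jobs_dict() from r['column'] subscripting to r.get('column'). Because a SQLAlchemy RowMapping implements the Mapping interface, .get() returns None for columns that are absent from the row (for example, when only a subset of fields was selected) instead of raising. A self-contained sketch of the difference, using an illustrative table rather than SkyPilot's schema:

    # Illustrative table; not SkyPilot's schema.
    import sqlalchemy

    engine = sqlalchemy.create_engine('sqlite://')
    metadata = sqlalchemy.MetaData()
    jobs = sqlalchemy.Table(
        'jobs', metadata,
        sqlalchemy.Column('job_id', sqlalchemy.Integer),
        sqlalchemy.Column('status', sqlalchemy.String),
    )
    metadata.create_all(engine)

    with engine.connect() as conn:
        conn.execute(sqlalchemy.insert(jobs).values(job_id=1, status='RUNNING'))
        # Select only one column, as a fields=[...] request would.
        row = conn.execute(sqlalchemy.select(jobs.c.job_id)).fetchone()
        mapping = row._mapping
        print(mapping.get('job_id'))  # 1
        print(mapping.get('status'))  # None -- column not selected
        # mapping['status'] would raise a KeyError here.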


@@ -1200,6 +1200,277 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
  return jobs


+ def _map_response_field_to_db_column(field: str):
+ """Map the response field name to an actual SQLAlchemy ColumnElement.
+
+ This ensures we never pass plain strings to SQLAlchemy 2.0 APIs like
+ Select.with_only_columns().
+ """
+ # Explicit aliases differing from actual DB column names
+ alias_mapping = {
+ '_job_id': spot_table.c.job_id, # spot.job_id
+ '_task_name': spot_table.c.job_name, # deprecated, from spot table
+ 'job_id': spot_table.c.spot_job_id, # public job id -> spot.spot_job_id
+ '_job_info_job_id': job_info_table.c.spot_job_id,
+ 'job_name': job_info_table.c.name, # public job name -> job_info.name
+ }
+ if field in alias_mapping:
+ return alias_mapping[field]
+
+ # Try direct match on the `spot` table columns
+ if field in spot_table.c:
+ return spot_table.c[field]
+
+ # Try direct match on the `job_info` table columns
+ if field in job_info_table.c:
+ return job_info_table.c[field]
+
+ raise ValueError(f'Unknown field: {field}')
+
+
+ @_init_db
+ def get_managed_jobs_total() -> int:
+ """Get the total number of managed jobs."""
+ assert _SQLALCHEMY_ENGINE is not None
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
+ result = session.execute(
+ sqlalchemy.select(sqlalchemy.func.count() # pylint: disable=not-callable
+ ).select_from(spot_table)).fetchone()
+ return result[0] if result else 0
+
+
+ @_init_db
+ def get_managed_jobs_highest_priority() -> int:
+ """Get the highest priority of the managed jobs."""
+ assert _SQLALCHEMY_ENGINE is not None
+ query = sqlalchemy.select(sqlalchemy.func.max(
+ job_info_table.c.priority)).where(
+ sqlalchemy.and_(
+ job_info_table.c.schedule_state.in_([
+ ManagedJobScheduleState.LAUNCHING.value,
+ ManagedJobScheduleState.ALIVE_BACKOFF.value,
+ ManagedJobScheduleState.WAITING.value,
+ ManagedJobScheduleState.ALIVE_WAITING.value,
+ ]),
+ job_info_table.c.priority.is_not(None),
+ ))
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
+ priority = session.execute(query).fetchone()
+ return priority[0] if priority and priority[
+ 0] is not None else constants.MIN_PRIORITY
+
+
+ def build_managed_jobs_with_filters_no_status_query(
+ fields: Optional[List[str]] = None,
+ job_ids: Optional[List[int]] = None,
+ accessible_workspaces: Optional[List[str]] = None,
+ workspace_match: Optional[str] = None,
+ name_match: Optional[str] = None,
+ pool_match: Optional[str] = None,
+ user_hashes: Optional[List[Optional[str]]] = None,
+ skip_finished: bool = False,
+ count_only: bool = False,
+ status_count: bool = False,
+ ) -> sqlalchemy.Select:
+ """Build a query to get managed jobs from the database with filters."""
+ # Join spot and job_info tables to get the job name for each task.
+ # We use LEFT OUTER JOIN mainly for backward compatibility, as for an
+ # existing controller before #1982, the job_info table may not exist,
+ # and all the managed jobs created before will not present in the
+ # job_info.
+ # Note: we will get the user_hash here, but don't try to call
+ # global_user_state.get_user() on it. This runs on the controller, which may
+ # not have the user info. Prefer to do it on the API server side.
+ if count_only:
+ query = sqlalchemy.select(sqlalchemy.func.count().label('count')) # pylint: disable=not-callable
+ elif status_count:
+ query = sqlalchemy.select(spot_table.c.status,
+ sqlalchemy.func.count().label('count')) # pylint: disable=not-callable
+ else:
+ query = sqlalchemy.select(spot_table, job_info_table)
+ query = query.select_from(
+ spot_table.outerjoin(
+ job_info_table,
+ spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
+ if skip_finished:
+ # Filter out finished jobs at the DB level. If a multi-task job is
+ # partially finished, include all its tasks. We do this by first
+ # selecting job_ids that have at least one non-terminal task, then
+ # restricting the main query to those job_ids.
+ terminal_status_values = [
+ s.value for s in ManagedJobStatus.terminal_statuses()
+ ]
+ non_terminal_job_ids_subquery = (sqlalchemy.select(
+ spot_table.c.spot_job_id).where(
+ sqlalchemy.or_(
+ spot_table.c.status.is_(None),
+ sqlalchemy.not_(
+ spot_table.c.status.in_(terminal_status_values)),
+ )).distinct())
+ query = query.where(
+ spot_table.c.spot_job_id.in_(non_terminal_job_ids_subquery))
+ if not count_only and not status_count and fields:
+ # Resolve requested field names to explicit ColumnElements from
+ # the joined tables.
+ selected_columns = [_map_response_field_to_db_column(f) for f in fields]
+ query = query.with_only_columns(*selected_columns)
+ if job_ids is not None:
+ query = query.where(spot_table.c.spot_job_id.in_(job_ids))
+ if accessible_workspaces is not None:
+ query = query.where(
+ job_info_table.c.workspace.in_(accessible_workspaces))
+ if workspace_match is not None:
+ query = query.where(
+ job_info_table.c.workspace.like(f'%{workspace_match}%'))
+ if name_match is not None:
+ query = query.where(job_info_table.c.name.like(f'%{name_match}%'))
+ if pool_match is not None:
+ query = query.where(job_info_table.c.pool.like(f'%{pool_match}%'))
+ if user_hashes is not None:
+ query = query.where(job_info_table.c.user_hash.in_(user_hashes))
+ return query
+
+
+ def build_managed_jobs_with_filters_query(
+ fields: Optional[List[str]] = None,
+ job_ids: Optional[List[int]] = None,
+ accessible_workspaces: Optional[List[str]] = None,
+ workspace_match: Optional[str] = None,
+ name_match: Optional[str] = None,
+ pool_match: Optional[str] = None,
+ user_hashes: Optional[List[Optional[str]]] = None,
+ statuses: Optional[List[str]] = None,
+ skip_finished: bool = False,
+ count_only: bool = False,
+ ) -> sqlalchemy.Select:
+ """Build a query to get managed jobs from the database with filters."""
+ query = build_managed_jobs_with_filters_no_status_query(
+ fields=fields,
+ job_ids=job_ids,
+ accessible_workspaces=accessible_workspaces,
+ workspace_match=workspace_match,
+ name_match=name_match,
+ pool_match=pool_match,
+ user_hashes=user_hashes,
+ skip_finished=skip_finished,
+ count_only=count_only,
+ )
+ if statuses is not None:
+ query = query.where(spot_table.c.status.in_(statuses))
+ return query
+
+
+ @_init_db
+ def get_status_count_with_filters(
+ fields: Optional[List[str]] = None,
+ job_ids: Optional[List[int]] = None,
+ accessible_workspaces: Optional[List[str]] = None,
+ workspace_match: Optional[str] = None,
+ name_match: Optional[str] = None,
+ pool_match: Optional[str] = None,
+ user_hashes: Optional[List[Optional[str]]] = None,
+ skip_finished: bool = False,
+ ) -> Dict[str, int]:
+ """Get the status count of the managed jobs with filters."""
+ query = build_managed_jobs_with_filters_no_status_query(
+ fields=fields,
+ job_ids=job_ids,
+ accessible_workspaces=accessible_workspaces,
+ workspace_match=workspace_match,
+ name_match=name_match,
+ pool_match=pool_match,
+ user_hashes=user_hashes,
+ skip_finished=skip_finished,
+ status_count=True,
+ )
+ query = query.group_by(spot_table.c.status)
+ results: Dict[str, int] = {}
+ assert _SQLALCHEMY_ENGINE is not None
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
+ rows = session.execute(query).fetchall()
+ for status_value, count in rows:
+ # status_value is already a string (enum value)
+ results[str(status_value)] = int(count)
+ return results
+
+
+ @_init_db
+ def get_managed_jobs_with_filters(
+ fields: Optional[List[str]] = None,
+ job_ids: Optional[List[int]] = None,
+ accessible_workspaces: Optional[List[str]] = None,
+ workspace_match: Optional[str] = None,
+ name_match: Optional[str] = None,
+ pool_match: Optional[str] = None,
+ user_hashes: Optional[List[Optional[str]]] = None,
+ statuses: Optional[List[str]] = None,
+ skip_finished: bool = False,
+ page: Optional[int] = None,
+ limit: Optional[int] = None,
+ ) -> Tuple[List[Dict[str, Any]], int]:
+ """Get managed jobs from the database with filters."""
+ assert _SQLALCHEMY_ENGINE is not None
+
+ count_query = build_managed_jobs_with_filters_query(
+ fields=None,
+ job_ids=job_ids,
+ accessible_workspaces=accessible_workspaces,
+ workspace_match=workspace_match,
+ name_match=name_match,
+ pool_match=pool_match,
+ user_hashes=user_hashes,
+ statuses=statuses,
+ skip_finished=skip_finished,
+ count_only=True,
+ )
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
+ total = session.execute(count_query).fetchone()[0]
+
+ query = build_managed_jobs_with_filters_query(
+ fields=fields,
+ job_ids=job_ids,
+ accessible_workspaces=accessible_workspaces,
+ workspace_match=workspace_match,
+ name_match=name_match,
+ pool_match=pool_match,
+ user_hashes=user_hashes,
+ statuses=statuses,
+ skip_finished=skip_finished,
+ )
+ query = query.order_by(spot_table.c.spot_job_id.desc(),
+ spot_table.c.task_id.asc())
+ if page is not None and limit is not None:
+ query = query.offset((page - 1) * limit).limit(limit)
+ rows = None
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
+ rows = session.execute(query).fetchall()
+ jobs = []
+ for row in rows:
+ job_dict = _get_jobs_dict(row._mapping) # pylint: disable=protected-access
+ job_dict['status'] = ManagedJobStatus(job_dict['status'])
+ if job_dict.get('schedule_state') is not None:
+ job_dict['schedule_state'] = ManagedJobScheduleState(
+ job_dict['schedule_state'])
+ if job_dict.get('job_name') is None:
+ job_dict['job_name'] = job_dict.get('task_name')
+ if job_dict.get('metadata') is not None:
+ job_dict['metadata'] = json.loads(job_dict['metadata'])
+
+ # Add user YAML content for managed jobs.
+ yaml_path = job_dict.get('original_user_yaml_path')
+ if (not fields or 'user_yaml' in fields) and yaml_path:
+ try:
+ with open(yaml_path, 'r', encoding='utf-8') as f:
+ job_dict['user_yaml'] = f.read()
+ except (FileNotFoundError, IOError, OSError):
+ job_dict['user_yaml'] = None
+ else:
+ job_dict['user_yaml'] = None
+
+ jobs.append(job_dict)
+ return jobs, total
+
+
  @_init_db
  def get_task_name(job_id: int, task_id: int) -> str:
  """Get the task name of a job."""
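As the docstring of _map_response_field_to_db_column() notes, SQLAlchemy 2.0's Select.with_only_columns() expects column expressions rather than plain strings, which is why requested field names are resolved to Column objects before narrowing the query. A self-contained sketch of that API, with an illustrative table rather than SkyPilot's actual schema:

    # Illustrative table; not SkyPilot's schema.
    import sqlalchemy

    metadata = sqlalchemy.MetaData()
    spot = sqlalchemy.Table(
        'spot', metadata,
        sqlalchemy.Column('spot_job_id', sqlalchemy.Integer),
        sqlalchemy.Column('status', sqlalchemy.String),
        sqlalchemy.Column('resources', sqlalchemy.String),
    )

    # Start from a wide SELECT, then narrow it to explicit Column objects,
    # mirroring how fields=[...] is mapped to columns before narrowing.
    query = sqlalchemy.select(spot)
    query = query.with_only_columns(spot.c.spot_job_id, spot.c.status)
    print(query)  # SELECT spot.spot_job_id, spot.status FROM spot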
@@ -1278,25 +1549,6 @@ def get_pool_from_job_id(job_id: int) -> Optional[str]:
  return pool[0] if pool else None


- @_init_db
- def get_pool_and_submit_info_from_job_ids(
- job_ids: Set[int]
- ) -> Dict[int, Tuple[Optional[str], Optional[str], Optional[int]]]:
- """Get the pool, cluster name, and job id on pool from job id"""
- assert _SQLALCHEMY_ENGINE is not None
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
- rows = session.execute(
- sqlalchemy.select(
- job_info_table.c.spot_job_id, job_info_table.c.pool,
- job_info_table.c.current_cluster_name,
- job_info_table.c.job_id_on_pool_cluster).where(
- job_info_table.c.spot_job_id.in_(job_ids))).fetchall()
- return {
- job_id: (pool, cluster_name, job_id_on_pool_cluster)
- for job_id, pool, cluster_name, job_id_on_pool_cluster in rows
- }
-
-

  @_init_db
  def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
  """Set the current cluster name for a job."""