skypilot-nightly 1.0.0.dev20251029__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly has been flagged as potentially problematic by the registry.

Files changed (68)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/client/cli/command.py +47 -23
  4. sky/clouds/aws.py +59 -11
  5. sky/dashboard/out/404.html +1 -1
  6. sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
  7. sky/dashboard/out/_next/static/chunks/{webpack-485984ca04e021d0.js → webpack-e38d5319cd10a3a0.js} +1 -1
  8. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  9. sky/dashboard/out/clusters/[cluster].html +1 -1
  10. sky/dashboard/out/clusters.html +1 -1
  11. sky/dashboard/out/config.html +1 -1
  12. sky/dashboard/out/index.html +1 -1
  13. sky/dashboard/out/infra/[context].html +1 -1
  14. sky/dashboard/out/infra.html +1 -1
  15. sky/dashboard/out/jobs/[job].html +1 -1
  16. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  17. sky/dashboard/out/jobs.html +1 -1
  18. sky/dashboard/out/users.html +1 -1
  19. sky/dashboard/out/volumes.html +1 -1
  20. sky/dashboard/out/workspace/new.html +1 -1
  21. sky/dashboard/out/workspaces/[name].html +1 -1
  22. sky/dashboard/out/workspaces.html +1 -1
  23. sky/data/mounting_utils.py +32 -2
  24. sky/jobs/constants.py +2 -0
  25. sky/jobs/controller.py +62 -67
  26. sky/jobs/file_content_utils.py +80 -0
  27. sky/jobs/log_gc.py +201 -0
  28. sky/jobs/scheduler.py +15 -2
  29. sky/jobs/server/core.py +85 -13
  30. sky/jobs/server/server.py +12 -11
  31. sky/jobs/server/utils.py +28 -10
  32. sky/jobs/state.py +216 -40
  33. sky/jobs/utils.py +60 -22
  34. sky/metrics/utils.py +18 -0
  35. sky/schemas/api/responses.py +1 -0
  36. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  37. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  38. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  39. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  40. sky/serve/server/server.py +8 -7
  41. sky/server/common.py +21 -15
  42. sky/server/constants.py +1 -1
  43. sky/server/daemons.py +23 -17
  44. sky/server/requests/executor.py +7 -3
  45. sky/server/requests/request_names.py +80 -0
  46. sky/server/server.py +103 -35
  47. sky/skylet/constants.py +6 -1
  48. sky/skylet/events.py +7 -0
  49. sky/skylet/services.py +18 -7
  50. sky/ssh_node_pools/server.py +5 -4
  51. sky/task.py +4 -42
  52. sky/templates/kubernetes-ray.yml.j2 +1 -1
  53. sky/templates/websocket_proxy.py +140 -12
  54. sky/users/permission.py +4 -1
  55. sky/utils/db/migration_utils.py +1 -1
  56. sky/utils/resource_checker.py +4 -1
  57. sky/utils/schemas.py +23 -4
  58. sky/volumes/server/server.py +4 -3
  59. sky/workspaces/server.py +7 -6
  60. {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +36 -36
  61. {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +67 -62
  62. sky/dashboard/out/_next/static/chunks/2755.a239c652bf8684dd.js +0 -26
  63. /sky/dashboard/out/_next/static/{DabuSAKsc_y0wyJxpTIdQ → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +0 -0
  64. /sky/dashboard/out/_next/static/{DabuSAKsc_y0wyJxpTIdQ → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
  65. {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
  66. {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
  67. {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
  68. {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/jobs/file_content_utils.py ADDED
@@ -0,0 +1,80 @@
+"""Utilities for managing managed job file content.
+
+The helpers in this module fetch job file content (DAG YAML/env files) from the
+database-first storage added for managed jobs, transparently falling back to
+legacy on-disk paths when needed. Consumers should prefer the string-based
+helpers so controllers never have to rely on local disk state.
+"""
+
+import os
+from typing import Optional
+
+from sky import sky_logging
+from sky.jobs import state as managed_job_state
+
+logger = sky_logging.init_logger(__name__)
+
+
+def get_job_dag_content(job_id: int) -> Optional[str]:
+    """Get DAG YAML content for a job from database or disk.
+
+    Args:
+        job_id: The job ID
+
+    Returns:
+        DAG YAML content as string, or None if not found
+    """
+    file_info = managed_job_state.get_job_file_contents(job_id)
+
+    # Prefer content stored in the database
+    if file_info['dag_yaml_content'] is not None:
+        return file_info['dag_yaml_content']
+
+    # Fallback to disk path for backward compatibility
+    dag_yaml_path = file_info.get('dag_yaml_path')
+    if dag_yaml_path and os.path.exists(dag_yaml_path):
+        try:
+            with open(dag_yaml_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            logger.debug('Loaded DAG YAML from disk for job %s: %s', job_id,
+                         dag_yaml_path)
+            return content
+        except (FileNotFoundError, IOError, OSError) as e:
+            logger.warning(
+                f'Failed to read DAG YAML from disk {dag_yaml_path}: {e}')
+
+    logger.warning(f'DAG YAML content not found for job {job_id}')
+    return None
+
+
+def get_job_env_content(job_id: int) -> Optional[str]:
+    """Get environment file content for a job from database or disk.
+
+    Args:
+        job_id: The job ID
+
+    Returns:
+        Environment file content as string, or None if not found
+    """
+    file_info = managed_job_state.get_job_file_contents(job_id)
+
+    # Prefer content stored in the database
+    if file_info['env_file_content'] is not None:
+        return file_info['env_file_content']
+
+    # Fallback to disk path for backward compatibility
+    env_file_path = file_info.get('env_file_path')
+    if env_file_path and os.path.exists(env_file_path):
+        try:
+            with open(env_file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            logger.debug('Loaded environment file from disk for job %s: %s',
+                         job_id, env_file_path)
+            return content
+        except (FileNotFoundError, IOError, OSError) as e:
+            logger.warning(
+                f'Failed to read environment file from disk {env_file_path}: '
+                f'{e}')
+
+    # Environment file is optional, so don't warn if not found
+    return None
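The two helpers above are the intended entry points for controller-side code. Below is a minimal consumer sketch (an illustrative assumption, not part of the package): it fetches the stored DAG YAML via get_job_dag_content() and, only for tools that still expect an on-disk path, writes the string back to a temporary file.

# Hypothetical consumer sketch: only get_job_dag_content() comes from the
# module above; the temp-file step is an assumption for code paths that
# still require a file path instead of a string.
import tempfile

from sky.jobs import file_content_utils


def dag_yaml_path_for(job_id: int) -> str:
    content = file_content_utils.get_job_dag_content(job_id)
    if content is None:
        raise FileNotFoundError(f'No DAG YAML stored for job {job_id}')
    # Materialize the database-backed string into a temporary YAML file.
    with tempfile.NamedTemporaryFile('w',
                                     suffix='.yaml',
                                     delete=False,
                                     encoding='utf-8') as f:
        f.write(content)
        return f.name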
sky/jobs/log_gc.py ADDED
@@ -0,0 +1,201 @@
+"""Log garbage collection for managed jobs."""
+
+import asyncio
+from datetime import datetime
+import os
+import pathlib
+import shutil
+import time
+
+import anyio
+import filelock
+
+from sky import sky_logging
+from sky import skypilot_config
+from sky.jobs import constants as managed_job_constants
+from sky.jobs import state as managed_job_state
+from sky.jobs import utils as managed_job_utils
+from sky.utils import context
+from sky.utils import context_utils
+
+logger = sky_logging.init_logger(__name__)
+
+# Filelock for garbage collector leader election.
+_JOB_CONTROLLER_GC_LOCK_PATH = os.path.expanduser(
+    '~/.sky/locks/job_controller_gc.lock')
+
+_DEFAULT_TASK_LOGS_GC_RETENTION_HOURS = 24 * 7
+_DEFAULT_CONTROLLER_LOGS_GC_RETENTION_HOURS = 24 * 7
+
+_LEAST_FREQUENT_GC_INTERVAL_SECONDS = 3600
+_MOST_FREQUENT_GC_INTERVAL_SECONDS = 30
+
+
+def _next_gc_interval(retention_seconds: int) -> int:
+    """Get the next GC interval."""
+    # Run the GC at least per hour to ensure hourly accuracy and
+    # at most per 30 seconds (when retention_seconds is small) to
+    # avoid too frequent cleanup.
+    return max(min(retention_seconds, _LEAST_FREQUENT_GC_INTERVAL_SECONDS),
+               _MOST_FREQUENT_GC_INTERVAL_SECONDS)
+
+
+async def gc_controller_logs_for_job():
+    """Garbage collect job and controller logs."""
+    while True:
+        skypilot_config.reload_config()
+        controller_logs_retention = skypilot_config.get_nested(
+            ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
+            _DEFAULT_CONTROLLER_LOGS_GC_RETENTION_HOURS) * 3600
+        # Negative value disables the GC
+        if controller_logs_retention >= 0:
+            logger.info(f'GC controller logs for job: retention '
+                        f'{controller_logs_retention} seconds')
+            try:
+                finished = False
+                while not finished:
+                    finished = await _clean_controller_logs_with_retention(
+                        controller_logs_retention)
+            except asyncio.CancelledError:
+                logger.info('Managed jobs logs GC task cancelled')
+                break
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(f'Error GC controller logs for job: {e}',
+                             exc_info=True)
+        else:
+            logger.info('Controller logs GC is disabled')
+
+        interval = _next_gc_interval(controller_logs_retention)
+        logger.info('Next controller logs GC is scheduled after '
+                    f'{interval} seconds')
+        await asyncio.sleep(interval)
+
+
+async def gc_task_logs_for_job():
+    """Garbage collect task logs for job."""
+    while True:
+        skypilot_config.reload_config()
+        task_logs_retention = skypilot_config.get_nested(
+            ('jobs', 'controller', 'task_logs_gc_retention_hours'),
+            _DEFAULT_TASK_LOGS_GC_RETENTION_HOURS) * 3600
+        # Negative value disables the GC
+        if task_logs_retention >= 0:
+            logger.info('GC task logs for job: '
+                        f'retention {task_logs_retention} seconds')
+            try:
+                finished = False
+                while not finished:
+                    finished = await _clean_task_logs_with_retention(
+                        task_logs_retention)
+            except asyncio.CancelledError:
+                logger.info('Task logs GC task cancelled')
+                break
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(f'Error GC task logs for job: {e}', exc_info=True)
+        else:
+            logger.info('Controller logs GC is disabled')
+
+        interval = _next_gc_interval(task_logs_retention)
+        logger.info(f'Next task logs GC is scheduled after {interval} seconds')
+        await asyncio.sleep(_next_gc_interval(task_logs_retention))
+
+
+async def _clean_controller_logs_with_retention(retention_seconds: int,
+                                                batch_size: int = 100):
+    """Clean controller logs with retention.
+
+    Returns:
+        Whether the GC of this round has finished, False means there might
+        still be more controller logs to clean.
+    """
+    assert batch_size > 0, 'Batch size must be positive'
+    jobs = await managed_job_state.get_controller_logs_to_clean_async(
+        retention_seconds, batch_size=batch_size)
+    job_ids_to_update = []
+    for job in jobs:
+        job_ids_to_update.append(job['job_id'])
+        log_file = managed_job_utils.controller_log_file_for_job(job['job_id'])
+        cleaned_at = time.time()
+        if await anyio.Path(log_file).exists():
+            ts_str = datetime.fromtimestamp(cleaned_at).strftime(
+                '%Y-%m-%d %H:%M:%S')
+            msg = f'Controller log has been cleaned at {ts_str}.'
+            # Sync down logs will reference to this file directly, so we
+            # keep the file and delete the content.
+            # TODO(aylei): refactor sync down logs if the inode usage
+            # becomes an issue.
+            async with await anyio.open_file(log_file, 'w',
+                                             encoding='utf-8') as f:
+                await f.write(msg + '\n')
+    # Batch the update, the timestamp will be not accurate but it's okay.
+    await managed_job_state.set_controller_logs_cleaned_async(
+        job_ids=job_ids_to_update, logs_cleaned_at=time.time())
+    complete = len(jobs) < batch_size
+    logger.info(f'Cleaned {len(jobs)} controller logs with retention '
+                f'{retention_seconds} seconds, complete: {complete}')
+    return complete
+
+
+async def _clean_task_logs_with_retention(retention_seconds: int,
+                                          batch_size: int = 100):
+    """Clean task logs with retention.
+
+    Returns:
+        Whether the GC of this round has finished, False means there might
+        still be more task logs to clean.
+    """
+    assert batch_size > 0, 'Batch size must be positive'
+    tasks = await managed_job_state.get_task_logs_to_clean_async(
+        retention_seconds, batch_size=batch_size)
+    tasks_to_update = []
+    for task in tasks:
+        local_log_file = anyio.Path(task['local_log_file'])
+        # We assume the log directory has the following layout:
+        # task-id/
+        # - run.log
+        # - tasks/
+        # - run.log
+        # and also remove the tasks directory on cleanup.
+        task_log_dir = local_log_file.parent.joinpath('tasks')
+        await local_log_file.unlink(missing_ok=True)
+        await context_utils.to_thread(shutil.rmtree,
+                                      str(task_log_dir),
+                                      ignore_errors=True)
+        # We have at least once semantic guarantee for the cleanup here.
+        tasks_to_update.append((task['job_id'], task['task_id']))
+    await managed_job_state.set_task_logs_cleaned_async(
+        tasks=list(tasks_to_update), logs_cleaned_at=time.time())
+    complete = len(tasks) < batch_size
+    logger.info(f'Cleaned {len(tasks)} task logs with retention '
+                f'{retention_seconds} seconds, complete: {complete}')
+    return complete
+
+
+@context.contextual_async
+async def run_log_gc():
+    """Run the log garbage collector."""
+    log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+    os.makedirs(log_dir, exist_ok=True)
+    log_path = os.path.join(log_dir, 'garbage_collector.log')
+    # Remove previous log file
+    await anyio.Path(log_path).unlink(missing_ok=True)
+    ctx = context.get()
+    assert ctx is not None, 'Context is not initialized'
+    ctx.redirect_log(pathlib.Path(log_path))
+    gc_controller_logs_for_job_task = asyncio.create_task(
+        gc_controller_logs_for_job())
+    gc_task_logs_for_job_task = asyncio.create_task(gc_task_logs_for_job())
+    await asyncio.gather(gc_controller_logs_for_job_task,
+                         gc_task_logs_for_job_task)
+
+
+def elect_for_log_gc():
+    """Use filelock to elect for the log garbage collector.
+
+    The log garbage collector runs in the controller process to avoid the
+    overhead of launching a new process and the lifecycle management, the
+    threads that does not elected as the log garbage collector just wait
+    on the filelock and bring trivial overhead.
+    """
+    with filelock.FileLock(_JOB_CONTROLLER_GC_LOCK_PATH):
+        asyncio.run(run_log_gc())
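The GC loops read their retention from the config keys jobs.controller.controller_logs_gc_retention_hours and jobs.controller.task_logs_gc_retention_hours (both default to 168 hours; a negative value disables collection) and sleep between passes for the retention period clamped by _next_gc_interval(). A small sketch of that clamping arithmetic:

# Sketch of the wake-up cadence implied by _next_gc_interval() above.
def next_gc_interval(retention_seconds: int) -> int:
    # Clamp the sleep between GC passes to the [30 s, 3600 s] window.
    return max(min(retention_seconds, 3600), 30)


assert next_gc_interval(10) == 30               # tiny retention: poll every 30 s
assert next_gc_interval(600) == 600             # mid-range: poll at the retention
assert next_gc_interval(7 * 24 * 3600) == 3600  # default week: hourly pass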
sky/jobs/scheduler.py CHANGED
@@ -263,6 +263,7 @@ def maybe_start_controllers(from_scheduler: bool = False) -> None:
 
         if started > 0:
             logger.info(f'Started {started} controllers')
+
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
         # should launch any pending jobs.
@@ -289,8 +290,20 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
         maybe_start_controllers(from_scheduler=True)
         return
 
-    state.scheduler_set_waiting(job_id, dag_yaml_path, original_user_yaml_path,
-                                env_file_path, priority)
+    with open(dag_yaml_path, 'r', encoding='utf-8') as dag_file:
+        dag_yaml_content = dag_file.read()
+    with open(original_user_yaml_path, 'r',
+              encoding='utf-8') as original_user_yaml_file:
+        original_user_yaml_content = original_user_yaml_file.read()
+    with open(env_file_path, 'r', encoding='utf-8') as env_file:
+        env_file_content = env_file.read()
+    logger.debug(f'Storing job {job_id} file contents in database '
+                 f'(DAG bytes={len(dag_yaml_content)}, '
+                 f'original user yaml bytes={len(original_user_yaml_content)}, '
+                 f'env bytes={len(env_file_content)}).')
+    state.scheduler_set_waiting(job_id, dag_yaml_content,
+                                original_user_yaml_content, env_file_content,
+                                priority)
     if state.get_ha_recovery_script(job_id) is None:
         # the run command is just the command that called scheduler
         run = (f'source {env_file_path} && '
sky/jobs/server/core.py CHANGED
@@ -1,4 +1,6 @@
 """SDK functions for managed jobs."""
+import concurrent.futures
+import copy
 import ipaddress
 import os
 import pathlib
@@ -60,6 +62,35 @@ else:
 
 logger = sky_logging.init_logger(__name__)
 
+_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
+    'job_id',
+    'task_id',
+    'workspace',
+    'job_name',
+    'task_name',
+    'resources',
+    'submitted_at',
+    'end_at',
+    'job_duration',
+    'recovery_count',
+    'status',
+    'pool',
+    'current_cluster_name',
+    'job_id_on_pool_cluster',
+    'start_at',
+    'infra',
+    'cloud',
+    'region',
+    'zone',
+    'cluster_resources',
+    'schedule_state',
+    'details',
+    'failure_reason',
+    'metadata',
+    'user_name',
+    'user_hash',
+]
+
 
 def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     """Upload files to the controller.
@@ -357,12 +388,15 @@ def launch(
         ) as original_user_yaml_path:
             original_user_yaml_path.write(user_dag_str_user_specified)
             original_user_yaml_path.flush()
-            for task_ in dag.tasks:
+            # Copy tasks to avoid race conditions when multiple threads modify
+            # the same dag object concurrently. Each thread needs its own copy.
+            dag_copy = copy.deepcopy(dag)
+            for task_ in dag_copy.tasks:
                 if job_rank is not None:
                     task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
                 task_.update_envs({'SKYPILOT_NUM_JOBS': str(num_jobs)})
 
-            dag_utils.dump_chain_dag_to_yaml(dag, f.name)
+            dag_utils.dump_chain_dag_to_yaml(dag_copy, f.name)
 
             vars_to_fill = {
                 'remote_original_user_yaml_path':
@@ -395,7 +429,8 @@
 
             yaml_path = os.path.join(
                 managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
-                f'{name}-{dag_uuid}-{consolidation_mode_job_id}.yaml')
+                f'{name}-{dag_uuid}-{consolidation_mode_job_id}-{job_rank}.yaml'
+            )
             common_utils.fill_template(
                 managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
                 vars_to_fill,
@@ -403,7 +438,7 @@
             controller_task = task_lib.Task.from_yaml(yaml_path)
             controller_task.set_resources(controller_resources)
 
-            controller_task.managed_job_dag = dag
+            controller_task.managed_job_dag = dag_copy
             # pylint: disable=protected-access
             controller_task._metadata = metadata
 
@@ -472,15 +507,49 @@
         assert len(consolidation_mode_job_ids) == 1
         return _submit_one(consolidation_mode_job_ids[0])
 
-    ids = []
-    all_handle = None
-    for job_rank in range(num_jobs):
-        job_id = (consolidation_mode_job_ids[job_rank]
+    ids: List[int] = []
+    all_handle: Optional[backends.ResourceHandle] = None
+
+    if num_jobs == 1:
+        job_id = (consolidation_mode_job_ids[0]
                   if consolidation_mode_job_ids is not None else None)
-        jid, handle = _submit_one(job_id, job_rank, num_jobs=num_jobs)
+        jid, handle = _submit_one(job_id, 0, num_jobs=num_jobs)
         assert jid is not None, (job_id, handle)
         ids.append(jid)
         all_handle = handle
+    else:
+        # Submit jobs in parallel using ThreadPoolExecutor
+        with concurrent.futures.ThreadPoolExecutor(
+                max_workers=min(num_jobs,
+                                os.cpu_count() or 1)) as executor:
+            # Submit jobs concurrently
+            future_to_rank = {}
+            for job_rank in range(num_jobs):
+                job_id = (consolidation_mode_job_ids[job_rank]
+                          if consolidation_mode_job_ids is not None else None)
+                future = executor.submit(_submit_one, job_id, job_rank,
+                                         num_jobs)
+                future_to_rank[future] = job_rank
+
+            # Collect results in order of job_rank to maintain consistent order.
+            results: List[Optional[Tuple[
+                int, Optional[backends.ResourceHandle]]]] = [None] * num_jobs
+            for future in concurrent.futures.as_completed(future_to_rank):
+                job_rank = future_to_rank[future]
+                try:
+                    jid, handle = future.result()
+                    assert jid is not None, (job_id, handle)
+                    results[job_rank] = (jid, handle)
+                    all_handle = handle  # Keep the last handle.
+                except Exception as e:
+                    logger.error(f'Error launching job {job_rank}: {e}')
+                    raise e
+
+            # Extract job IDs in order
+            for res in results:
+                if res is not None:
+                    ids.append(res[0])
+
     return ids, all_handle
 
 
@@ -533,7 +602,8 @@
         'kubernetes', cluster_info)[0]
 
     code = managed_job_utils.ManagedJobCodeGen.get_job_table(
-        skip_finished=skip_finished)
+        skip_finished=skip_finished,
+        fields=_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -646,8 +716,7 @@
         does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
-    jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids, None,
-                             None, None, None, None, None, None)
+    jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids)
 
    return jobs
 
@@ -764,7 +833,8 @@
     try:
         request = managed_jobsv1_pb2.GetJobTableRequest(
             skip_finished=skip_finished,
-            accessible_workspaces=accessible_workspaces,
+            accessible_workspaces=(managed_jobsv1_pb2.Workspaces(
+                workspaces=accessible_workspaces)),
             job_ids=managed_jobsv1_pb2.JobIds(
                 ids=job_ids) if job_ids is not None else None,
             workspace_match=workspace_match,
@@ -780,6 +850,8 @@
             ]) if user_hashes is not None else None,
             statuses=managed_jobsv1_pb2.Statuses(
                 statuses=statuses) if statuses is not None else None,
+            fields=managed_jobsv1_pb2.Fields(
+                fields=fields) if fields is not None else None,
             show_jobs_without_user_hash=show_jobs_without_user_hash,
         )
         response = backend_utils.invoke_skylet_with_retries(
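The launch path above now deep-copies the DAG per rank, fans the ranks out to a thread pool, and reassembles results by rank. A generic sketch of that fan-out/ordered-collect pattern (submit_one() here is a stand-in, not the package's internal _submit_one()):

# Generic sketch of the parallel-submission pattern used above.
import concurrent.futures
import os
from typing import List, Optional, Tuple


def submit_one(rank: int) -> Tuple[int, str]:
    return rank, f'handle-{rank}'  # placeholder for the real submission


def submit_all(num_jobs: int) -> List[Tuple[int, str]]:
    results: List[Optional[Tuple[int, str]]] = [None] * num_jobs
    max_workers = min(num_jobs, os.cpu_count() or 1)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as ex:
        # Map each future back to its rank so results can be re-ordered.
        future_to_rank = {ex.submit(submit_one, r): r for r in range(num_jobs)}
        for fut in concurrent.futures.as_completed(future_to_rank):
            results[future_to_rank[fut]] = fut.result()  # re-raises errors
    return [r for r in results if r is not None]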
sky/jobs/server/server.py CHANGED
@@ -11,6 +11,7 @@ from sky.server import common as server_common
 from sky.server import stream_utils
 from sky.server.requests import executor
 from sky.server.requests import payloads
+from sky.server.requests import request_names
 from sky.server.requests import requests as api_requests
 from sky.skylet import constants
 from sky.utils import common
@@ -37,7 +38,7 @@ async def launch(request: fastapi.Request,
                      if consolidation_mode else api_requests.ScheduleType.LONG)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='jobs.launch',
+        request_name=request_names.RequestName.JOBS_LAUNCH,
         request_body=jobs_launch_body,
         func=core.launch,
         schedule_type=schedule_type,
@@ -52,7 +53,7 @@ async def queue(request: fastapi.Request,
                 jobs_queue_body: payloads.JobsQueueBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='jobs.queue',
+        request_name=request_names.RequestName.JOBS_QUEUE,
         request_body=jobs_queue_body,
         func=core.queue,
         schedule_type=(api_requests.ScheduleType.LONG if jobs_queue_body.refresh
@@ -66,7 +67,7 @@ async def queue_v2(request: fastapi.Request,
                    jobs_queue_body_v2: payloads.JobsQueueV2Body) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='jobs.queue_v2',
+        request_name=request_names.RequestName.JOBS_QUEUE_V2,
         request_body=jobs_queue_body_v2,
         func=core.queue_v2_api,
         schedule_type=(api_requests.ScheduleType.LONG
@@ -81,7 +82,7 @@ async def cancel(request: fastapi.Request,
                  jobs_cancel_body: payloads.JobsCancelBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='jobs.cancel',
+        request_name=request_names.RequestName.JOBS_CANCEL,
         request_body=jobs_cancel_body,
         func=core.cancel,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -103,7 +104,7 @@ async def logs(
     executor.check_request_thread_executor_available()
     request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
-        request_name='jobs.logs',
+        request_name=request_names.RequestName.JOBS_LOGS,
         request_body=jobs_logs_body,
         func=core.tail_logs,
         schedule_type=schedule_type,
@@ -143,7 +144,7 @@ async def download_logs(
     jobs_download_logs_body.local_dir = str(logs_dir_on_api_server)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='jobs.download_logs',
+        request_name=request_names.RequestName.JOBS_DOWNLOAD_LOGS,
         request_body=jobs_download_logs_body,
         func=core.download_logs,
         schedule_type=api_requests.ScheduleType.LONG
@@ -157,7 +158,7 @@ async def pool_apply(request: fastapi.Request,
                      jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='jobs.pool_apply',
+        request_name=request_names.RequestName.JOBS_POOL_APPLY,
         request_body=jobs_pool_apply_body,
         func=core.pool_apply,
         schedule_type=api_requests.ScheduleType.LONG,
@@ -170,7 +171,7 @@ async def pool_down(request: fastapi.Request,
                     jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='jobs.pool_down',
+        request_name=request_names.RequestName.JOBS_POOL_DOWN,
         request_body=jobs_pool_down_body,
         func=core.pool_down,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -184,7 +185,7 @@ async def pool_status(
         jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='jobs.pool_status',
+        request_name=request_names.RequestName.JOBS_POOL_STATUS,
         request_body=jobs_pool_status_body,
         func=core.pool_status,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -199,7 +200,7 @@ async def pool_tail_logs(
 ) -> fastapi.responses.StreamingResponse:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='jobs.pool_logs',
+        request_name=request_names.RequestName.JOBS_POOL_LOGS,
         request_body=log_body,
         func=core.pool_tail_logs,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -235,7 +236,7 @@ async def pool_download_logs(
     download_logs_body.local_dir = str(logs_dir_on_api_server)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name='jobs.pool_sync_down_logs',
+        request_name=request_names.RequestName.JOBS_POOL_SYNC_DOWN_LOGS,
         request_body=download_logs_body,
         func=core.pool_sync_down_logs,
         schedule_type=api_requests.ScheduleType.SHORT,
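Every endpoint above now passes a constant from the new sky/server/requests/request_names.py (+80 lines, not shown in this diff) instead of a string literal. The module's exact contents are not visible here; a plausible sketch, assuming it is a str-valued enum whose values keep the old literal names so existing logs and request records stay unchanged:

# Assumed shape of sky/server/requests/request_names.py (not shown in this
# diff); member names and the old literal values are taken from the hunks
# above, the enum structure itself is an assumption.
import enum


class RequestName(str, enum.Enum):
    JOBS_LAUNCH = 'jobs.launch'
    JOBS_QUEUE = 'jobs.queue'
    JOBS_QUEUE_V2 = 'jobs.queue_v2'
    JOBS_CANCEL = 'jobs.cancel'
    JOBS_LOGS = 'jobs.logs'
    JOBS_DOWNLOAD_LOGS = 'jobs.download_logs'
    JOBS_POOL_APPLY = 'jobs.pool_apply'
    JOBS_POOL_DOWN = 'jobs.pool_down'
    JOBS_POOL_STATUS = 'jobs.pool_status'
    JOBS_POOL_LOGS = 'jobs.pool_logs'
    JOBS_POOL_SYNC_DOWN_LOGS = 'jobs.pool_sync_down_logs'


assert RequestName.JOBS_LAUNCH == 'jobs.launch'  # str-enum compares equal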
sky/jobs/server/utils.py CHANGED
@@ -19,6 +19,11 @@ else:
 managed_jobsv1_pb2 = adaptors_common.LazyImport(
     'sky.schemas.generated.managed_jobsv1_pb2')
 
+_MANAGED_JOB_FIELDS_TO_GET = [
+    'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
+    'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
+]
+
 
 def check_version_mismatch_and_non_terminal_jobs() -> None:
     """Check if controller has version mismatch and non-terminal jobs exist.
@@ -50,7 +55,11 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
         )).get_managed_job_controller_version(version_request))
     controller_version = version_response.controller_version
 
-    job_table_request = managed_jobsv1_pb2.GetJobTableRequest()
+    job_table_request = managed_jobsv1_pb2.GetJobTableRequest(
+        skip_finished=True,
+        fields=managed_jobsv1_pb2.Fields(
+            fields=_MANAGED_JOB_FIELDS_TO_GET),
+    )
     job_table_response = backend_utils.invoke_skylet_with_retries(
         lambda: cloud_vm_ray_backend.SkyletClient(
             handle.get_grpc_channel()).get_managed_job_table(
@@ -62,7 +71,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
 
     if use_legacy:
         # Get controller version and raw job table
-        code = managed_job_utils.ManagedJobCodeGen.get_version_and_job_table()
+        code = managed_job_utils.ManagedJobCodeGen.get_version()
 
         returncode, output, stderr = backend.run_on_head(handle,
                                                          code,
@@ -72,7 +81,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
 
         if returncode != 0:
             logger.error(output + stderr)
-            raise ValueError('Failed to check controller version and jobs with '
+            raise ValueError('Failed to check controller version with '
                              f'returncode: {returncode}.\n{output + stderr}')
 
         # Parse the output to extract controller version (split only on first
@@ -80,19 +89,28 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
         output_parts = output.strip().split('\n', 1)
 
         # Extract controller version from first line
-        if len(output_parts) < 2 or not output_parts[0].startswith(
-                'controller_version:'):
+        if not output_parts[0].startswith('controller_version:'):
             raise ValueError(
                 f'Expected controller version in first line, got: {output}')
 
         controller_version = output_parts[0].split(':', 1)[1]
 
-        # Rest is job table payload (preserving any newlines within it)
-        job_table_payload = output_parts[1]
+        code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+            skip_finished=True, fields=_MANAGED_JOB_FIELDS_TO_GET)
+        returncode, job_table_payload, stderr = backend.run_on_head(
+            handle,
+            code,
+            require_outputs=True,
+            stream_logs=False,
+            separate_stderr=True)
+
+        if returncode != 0:
+            logger.error(job_table_payload + stderr)
+            raise ValueError('Failed to fetch managed jobs with returncode: '
+                             f'{returncode}.\n{job_table_payload + stderr}')
 
-        # Load and filter jobs locally using existing method
-        jobs, _, _, _, _ = managed_job_utils.load_managed_job_queue(
-            job_table_payload)
+        jobs, _, _, _, _ = (
+            managed_job_utils.load_managed_job_queue(job_table_payload))
 
     # Process locally: check version match and filter non-terminal jobs
     version_matches = (controller_version == local_version or
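Both the gRPC and legacy paths now request only the columns listed in _MANAGED_JOB_FIELDS_TO_GET rather than the full job table. A small sketch of building such a field-limited request, using only the message names that appear in the hunks above:

# Sketch of a field-limited job-table request, mirroring the proto calls in
# the diff above (GetJobTableRequest / Fields come from the generated module).
from sky.schemas.generated import managed_jobsv1_pb2


def build_job_table_request(field_names):
    # Ask for only the named columns, which is the intent of the change above.
    return managed_jobsv1_pb2.GetJobTableRequest(
        skip_finished=True,
        fields=managed_jobsv1_pb2.Fields(fields=field_names),
    )


request = build_job_table_request(['job_id', 'status', 'job_duration'])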