skypilot-nightly 1.0.0.dev20251029__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/adaptors/aws.py +25 -7
- sky/client/cli/command.py +47 -23
- sky/clouds/aws.py +59 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
- sky/dashboard/out/_next/static/chunks/{webpack-485984ca04e021d0.js → webpack-e38d5319cd10a3a0.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +32 -2
- sky/jobs/constants.py +2 -0
- sky/jobs/controller.py +62 -67
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/scheduler.py +15 -2
- sky/jobs/server/core.py +85 -13
- sky/jobs/server/server.py +12 -11
- sky/jobs/server/utils.py +28 -10
- sky/jobs/state.py +216 -40
- sky/jobs/utils.py +60 -22
- sky/metrics/utils.py +18 -0
- sky/schemas/api/responses.py +1 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/server/server.py +8 -7
- sky/server/common.py +21 -15
- sky/server/constants.py +1 -1
- sky/server/daemons.py +23 -17
- sky/server/requests/executor.py +7 -3
- sky/server/requests/request_names.py +80 -0
- sky/server/server.py +103 -35
- sky/skylet/constants.py +6 -1
- sky/skylet/events.py +7 -0
- sky/skylet/services.py +18 -7
- sky/ssh_node_pools/server.py +5 -4
- sky/task.py +4 -42
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/templates/websocket_proxy.py +140 -12
- sky/users/permission.py +4 -1
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/resource_checker.py +4 -1
- sky/utils/schemas.py +23 -4
- sky/volumes/server/server.py +4 -3
- sky/workspaces/server.py +7 -6
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +36 -36
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +67 -62
- sky/dashboard/out/_next/static/chunks/2755.a239c652bf8684dd.js +0 -26
- /sky/dashboard/out/_next/static/{DabuSAKsc_y0wyJxpTIdQ → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{DabuSAKsc_y0wyJxpTIdQ → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251029.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/jobs/file_content_utils.py
ADDED
@@ -0,0 +1,80 @@
+"""Utilities for managing managed job file content.
+
+The helpers in this module fetch job file content (DAG YAML/env files) from the
+database-first storage added for managed jobs, transparently falling back to
+legacy on-disk paths when needed. Consumers should prefer the string-based
+helpers so controllers never have to rely on local disk state.
+"""
+
+import os
+from typing import Optional
+
+from sky import sky_logging
+from sky.jobs import state as managed_job_state
+
+logger = sky_logging.init_logger(__name__)
+
+
+def get_job_dag_content(job_id: int) -> Optional[str]:
+    """Get DAG YAML content for a job from database or disk.
+
+    Args:
+        job_id: The job ID
+
+    Returns:
+        DAG YAML content as string, or None if not found
+    """
+    file_info = managed_job_state.get_job_file_contents(job_id)
+
+    # Prefer content stored in the database
+    if file_info['dag_yaml_content'] is not None:
+        return file_info['dag_yaml_content']
+
+    # Fallback to disk path for backward compatibility
+    dag_yaml_path = file_info.get('dag_yaml_path')
+    if dag_yaml_path and os.path.exists(dag_yaml_path):
+        try:
+            with open(dag_yaml_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            logger.debug('Loaded DAG YAML from disk for job %s: %s', job_id,
+                         dag_yaml_path)
+            return content
+        except (FileNotFoundError, IOError, OSError) as e:
+            logger.warning(
+                f'Failed to read DAG YAML from disk {dag_yaml_path}: {e}')
+
+    logger.warning(f'DAG YAML content not found for job {job_id}')
+    return None
+
+
+def get_job_env_content(job_id: int) -> Optional[str]:
+    """Get environment file content for a job from database or disk.
+
+    Args:
+        job_id: The job ID
+
+    Returns:
+        Environment file content as string, or None if not found
+    """
+    file_info = managed_job_state.get_job_file_contents(job_id)
+
+    # Prefer content stored in the database
+    if file_info['env_file_content'] is not None:
+        return file_info['env_file_content']
+
+    # Fallback to disk path for backward compatibility
+    env_file_path = file_info.get('env_file_path')
+    if env_file_path and os.path.exists(env_file_path):
+        try:
+            with open(env_file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            logger.debug('Loaded environment file from disk for job %s: %s',
+                         job_id, env_file_path)
+            return content
+        except (FileNotFoundError, IOError, OSError) as e:
+            logger.warning(
+                f'Failed to read environment file from disk {env_file_path}: '
+                f'{e}')
+
+    # Environment file is optional, so don't warn if not found
+    return None
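For context, a minimal sketch of how a caller might use these helpers; the job ID and the YAML parsing step are illustrative only and are not part of the diff:

# Hypothetical consumer of the helpers above (job ID 42 is illustrative).
import yaml

from sky.jobs import file_content_utils

dag_yaml = file_content_utils.get_job_dag_content(42)
if dag_yaml is not None:
    # The helper returns a string whether the content came from the database
    # or from the legacy on-disk path, so callers never touch the filesystem.
    tasks = list(yaml.safe_load_all(dag_yaml))

env_content = file_content_utils.get_job_env_content(42)
# The env file is optional: None means no env file was stored, not an error.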
sky/jobs/log_gc.py
ADDED
@@ -0,0 +1,201 @@
+"""Log garbage collection for managed jobs."""
+
+import asyncio
+from datetime import datetime
+import os
+import pathlib
+import shutil
+import time
+
+import anyio
+import filelock
+
+from sky import sky_logging
+from sky import skypilot_config
+from sky.jobs import constants as managed_job_constants
+from sky.jobs import state as managed_job_state
+from sky.jobs import utils as managed_job_utils
+from sky.utils import context
+from sky.utils import context_utils
+
+logger = sky_logging.init_logger(__name__)
+
+# Filelock for garbage collector leader election.
+_JOB_CONTROLLER_GC_LOCK_PATH = os.path.expanduser(
+    '~/.sky/locks/job_controller_gc.lock')
+
+_DEFAULT_TASK_LOGS_GC_RETENTION_HOURS = 24 * 7
+_DEFAULT_CONTROLLER_LOGS_GC_RETENTION_HOURS = 24 * 7
+
+_LEAST_FREQUENT_GC_INTERVAL_SECONDS = 3600
+_MOST_FREQUENT_GC_INTERVAL_SECONDS = 30
+
+
+def _next_gc_interval(retention_seconds: int) -> int:
+    """Get the next GC interval."""
+    # Run the GC at least per hour to ensure hourly accuracy and
+    # at most per 30 seconds (when retention_seconds is small) to
+    # avoid too frequent cleanup.
+    return max(min(retention_seconds, _LEAST_FREQUENT_GC_INTERVAL_SECONDS),
+               _MOST_FREQUENT_GC_INTERVAL_SECONDS)
+
+
+async def gc_controller_logs_for_job():
+    """Garbage collect job and controller logs."""
+    while True:
+        skypilot_config.reload_config()
+        controller_logs_retention = skypilot_config.get_nested(
+            ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
+            _DEFAULT_CONTROLLER_LOGS_GC_RETENTION_HOURS) * 3600
+        # Negative value disables the GC
+        if controller_logs_retention >= 0:
+            logger.info(f'GC controller logs for job: retention '
+                        f'{controller_logs_retention} seconds')
+            try:
+                finished = False
+                while not finished:
+                    finished = await _clean_controller_logs_with_retention(
+                        controller_logs_retention)
+            except asyncio.CancelledError:
+                logger.info('Managed jobs logs GC task cancelled')
+                break
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(f'Error GC controller logs for job: {e}',
+                             exc_info=True)
+        else:
+            logger.info('Controller logs GC is disabled')
+
+        interval = _next_gc_interval(controller_logs_retention)
+        logger.info('Next controller logs GC is scheduled after '
+                    f'{interval} seconds')
+        await asyncio.sleep(interval)
+
+
+async def gc_task_logs_for_job():
+    """Garbage collect task logs for job."""
+    while True:
+        skypilot_config.reload_config()
+        task_logs_retention = skypilot_config.get_nested(
+            ('jobs', 'controller', 'task_logs_gc_retention_hours'),
+            _DEFAULT_TASK_LOGS_GC_RETENTION_HOURS) * 3600
+        # Negative value disables the GC
+        if task_logs_retention >= 0:
+            logger.info('GC task logs for job: '
+                        f'retention {task_logs_retention} seconds')
+            try:
+                finished = False
+                while not finished:
+                    finished = await _clean_task_logs_with_retention(
+                        task_logs_retention)
+            except asyncio.CancelledError:
+                logger.info('Task logs GC task cancelled')
+                break
+            except Exception as e:  # pylint: disable=broad-except
+                logger.error(f'Error GC task logs for job: {e}', exc_info=True)
+        else:
+            logger.info('Controller logs GC is disabled')
+
+        interval = _next_gc_interval(task_logs_retention)
+        logger.info(f'Next task logs GC is scheduled after {interval} seconds')
+        await asyncio.sleep(_next_gc_interval(task_logs_retention))
+
+
+async def _clean_controller_logs_with_retention(retention_seconds: int,
+                                                batch_size: int = 100):
+    """Clean controller logs with retention.
+
+    Returns:
+        Whether the GC of this round has finished, False means there might
+        still be more controller logs to clean.
+    """
+    assert batch_size > 0, 'Batch size must be positive'
+    jobs = await managed_job_state.get_controller_logs_to_clean_async(
+        retention_seconds, batch_size=batch_size)
+    job_ids_to_update = []
+    for job in jobs:
+        job_ids_to_update.append(job['job_id'])
+        log_file = managed_job_utils.controller_log_file_for_job(job['job_id'])
+        cleaned_at = time.time()
+        if await anyio.Path(log_file).exists():
+            ts_str = datetime.fromtimestamp(cleaned_at).strftime(
+                '%Y-%m-%d %H:%M:%S')
+            msg = f'Controller log has been cleaned at {ts_str}.'
+            # Sync down logs will reference to this file directly, so we
+            # keep the file and delete the content.
+            # TODO(aylei): refactor sync down logs if the inode usage
+            # becomes an issue.
+            async with await anyio.open_file(log_file, 'w',
+                                             encoding='utf-8') as f:
+                await f.write(msg + '\n')
+    # Batch the update, the timestamp will be not accurate but it's okay.
+    await managed_job_state.set_controller_logs_cleaned_async(
+        job_ids=job_ids_to_update, logs_cleaned_at=time.time())
+    complete = len(jobs) < batch_size
+    logger.info(f'Cleaned {len(jobs)} controller logs with retention '
+                f'{retention_seconds} seconds, complete: {complete}')
+    return complete
+
+
+async def _clean_task_logs_with_retention(retention_seconds: int,
+                                          batch_size: int = 100):
+    """Clean task logs with retention.
+
+    Returns:
+        Whether the GC of this round has finished, False means there might
+        still be more task logs to clean.
+    """
+    assert batch_size > 0, 'Batch size must be positive'
+    tasks = await managed_job_state.get_task_logs_to_clean_async(
+        retention_seconds, batch_size=batch_size)
+    tasks_to_update = []
+    for task in tasks:
+        local_log_file = anyio.Path(task['local_log_file'])
+        # We assume the log directory has the following layout:
+        # task-id/
+        #   - run.log
+        #   - tasks/
+        #     - run.log
+        # and also remove the tasks directory on cleanup.
+        task_log_dir = local_log_file.parent.joinpath('tasks')
+        await local_log_file.unlink(missing_ok=True)
+        await context_utils.to_thread(shutil.rmtree,
+                                      str(task_log_dir),
+                                      ignore_errors=True)
+        # We have at least once semantic guarantee for the cleanup here.
+        tasks_to_update.append((task['job_id'], task['task_id']))
+    await managed_job_state.set_task_logs_cleaned_async(
+        tasks=list(tasks_to_update), logs_cleaned_at=time.time())
+    complete = len(tasks) < batch_size
+    logger.info(f'Cleaned {len(tasks)} task logs with retention '
+                f'{retention_seconds} seconds, complete: {complete}')
+    return complete
+
+
+@context.contextual_async
+async def run_log_gc():
+    """Run the log garbage collector."""
+    log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+    os.makedirs(log_dir, exist_ok=True)
+    log_path = os.path.join(log_dir, 'garbage_collector.log')
+    # Remove previous log file
+    await anyio.Path(log_path).unlink(missing_ok=True)
+    ctx = context.get()
+    assert ctx is not None, 'Context is not initialized'
+    ctx.redirect_log(pathlib.Path(log_path))
+    gc_controller_logs_for_job_task = asyncio.create_task(
+        gc_controller_logs_for_job())
+    gc_task_logs_for_job_task = asyncio.create_task(gc_task_logs_for_job())
+    await asyncio.gather(gc_controller_logs_for_job_task,
+                         gc_task_logs_for_job_task)
+
+
+def elect_for_log_gc():
+    """Use filelock to elect for the log garbage collector.
+
+    The log garbage collector runs in the controller process to avoid the
+    overhead of launching a new process and the lifecycle management; the
+    threads that are not elected as the log garbage collector just wait
+    on the filelock and bring trivial overhead.
+    """
+    with filelock.FileLock(_JOB_CONTROLLER_GC_LOCK_PATH):
+        asyncio.run(run_log_gc())
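The GC loops above wake up on a clamped schedule: _next_gc_interval bounds the sleep between 30 seconds and one hour regardless of the configured retention. A self-contained sketch of that clamping, with the constants from the diff inlined:

# Clamping behavior of _next_gc_interval (constants inlined from the diff above).
def _next_gc_interval(retention_seconds: int) -> int:
    return max(min(retention_seconds, 3600), 30)


assert _next_gc_interval(10) == 30               # Very short retention: poll at most every 30s.
assert _next_gc_interval(600) == 600             # Mid-range retention: poll once per retention window.
assert _next_gc_interval(7 * 24 * 3600) == 3600  # Default week-long retention: poll hourly.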
sky/jobs/scheduler.py
CHANGED
@@ -263,6 +263,7 @@ def maybe_start_controllers(from_scheduler: bool = False) -> None:
 
         if started > 0:
             logger.info(f'Started {started} controllers')
+
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
         # should launch any pending jobs.
@@ -289,8 +290,20 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
         maybe_start_controllers(from_scheduler=True)
         return
 
-
-
+    with open(dag_yaml_path, 'r', encoding='utf-8') as dag_file:
+        dag_yaml_content = dag_file.read()
+    with open(original_user_yaml_path, 'r',
+              encoding='utf-8') as original_user_yaml_file:
+        original_user_yaml_content = original_user_yaml_file.read()
+    with open(env_file_path, 'r', encoding='utf-8') as env_file:
+        env_file_content = env_file.read()
+    logger.debug(f'Storing job {job_id} file contents in database '
+                 f'(DAG bytes={len(dag_yaml_content)}, '
+                 f'original user yaml bytes={len(original_user_yaml_content)}, '
+                 f'env bytes={len(env_file_content)}).')
+    state.scheduler_set_waiting(job_id, dag_yaml_content,
+                                original_user_yaml_content, env_file_content,
+                                priority)
     if state.get_ha_recovery_script(job_id) is None:
         # the run command is just the command that called scheduler
         run = (f'source {env_file_path} && '
sky/jobs/server/core.py
CHANGED
@@ -1,4 +1,6 @@
 """SDK functions for managed jobs."""
+import concurrent.futures
+import copy
 import ipaddress
 import os
 import pathlib
@@ -60,6 +62,35 @@ else:
 
 logger = sky_logging.init_logger(__name__)
 
+_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
+    'job_id',
+    'task_id',
+    'workspace',
+    'job_name',
+    'task_name',
+    'resources',
+    'submitted_at',
+    'end_at',
+    'job_duration',
+    'recovery_count',
+    'status',
+    'pool',
+    'current_cluster_name',
+    'job_id_on_pool_cluster',
+    'start_at',
+    'infra',
+    'cloud',
+    'region',
+    'zone',
+    'cluster_resources',
+    'schedule_state',
+    'details',
+    'failure_reason',
+    'metadata',
+    'user_name',
+    'user_hash',
+]
+
 
 def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
     """Upload files to the controller.
@@ -357,12 +388,15 @@ def launch(
         ) as original_user_yaml_path:
             original_user_yaml_path.write(user_dag_str_user_specified)
             original_user_yaml_path.flush()
-
+            # Copy tasks to avoid race conditions when multiple threads modify
+            # the same dag object concurrently. Each thread needs its own copy.
+            dag_copy = copy.deepcopy(dag)
+            for task_ in dag_copy.tasks:
                 if job_rank is not None:
                     task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
                 task_.update_envs({'SKYPILOT_NUM_JOBS': str(num_jobs)})
 
-            dag_utils.dump_chain_dag_to_yaml(
+            dag_utils.dump_chain_dag_to_yaml(dag_copy, f.name)
 
             vars_to_fill = {
                 'remote_original_user_yaml_path':
@@ -395,7 +429,8 @@ def launch(
 
             yaml_path = os.path.join(
                 managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
-                f'{name}-{dag_uuid}-{consolidation_mode_job_id}.yaml'
+                f'{name}-{dag_uuid}-{consolidation_mode_job_id}-{job_rank}.yaml'
+            )
             common_utils.fill_template(
                 managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
                 vars_to_fill,
@@ -403,7 +438,7 @@ def launch(
             controller_task = task_lib.Task.from_yaml(yaml_path)
             controller_task.set_resources(controller_resources)
 
-            controller_task.managed_job_dag =
+            controller_task.managed_job_dag = dag_copy
             # pylint: disable=protected-access
             controller_task._metadata = metadata
 
@@ -472,15 +507,49 @@ def launch(
         assert len(consolidation_mode_job_ids) == 1
         return _submit_one(consolidation_mode_job_ids[0])
 
-    ids = []
-    all_handle = None
-
-
+    ids: List[int] = []
+    all_handle: Optional[backends.ResourceHandle] = None
+
+    if num_jobs == 1:
+        job_id = (consolidation_mode_job_ids[0]
                   if consolidation_mode_job_ids is not None else None)
-    jid, handle = _submit_one(job_id,
+        jid, handle = _submit_one(job_id, 0, num_jobs=num_jobs)
         assert jid is not None, (job_id, handle)
         ids.append(jid)
         all_handle = handle
+    else:
+        # Submit jobs in parallel using ThreadPoolExecutor
+        with concurrent.futures.ThreadPoolExecutor(
+                max_workers=min(num_jobs,
+                                os.cpu_count() or 1)) as executor:
+            # Submit jobs concurrently
+            future_to_rank = {}
+            for job_rank in range(num_jobs):
+                job_id = (consolidation_mode_job_ids[job_rank]
+                          if consolidation_mode_job_ids is not None else None)
+                future = executor.submit(_submit_one, job_id, job_rank,
+                                         num_jobs)
+                future_to_rank[future] = job_rank
+
+            # Collect results in order of job_rank to maintain consistent order.
+            results: List[Optional[Tuple[
+                int, Optional[backends.ResourceHandle]]]] = [None] * num_jobs
+            for future in concurrent.futures.as_completed(future_to_rank):
+                job_rank = future_to_rank[future]
+                try:
+                    jid, handle = future.result()
+                    assert jid is not None, (job_id, handle)
+                    results[job_rank] = (jid, handle)
+                    all_handle = handle  # Keep the last handle.
+                except Exception as e:
+                    logger.error(f'Error launching job {job_rank}: {e}')
+                    raise e
+
+        # Extract job IDs in order
+        for res in results:
+            if res is not None:
+                ids.append(res[0])
+
     return ids, all_handle
 
 
@@ -533,7 +602,8 @@ def queue_from_kubernetes_pod(
             'kubernetes', cluster_info)[0]
 
     code = managed_job_utils.ManagedJobCodeGen.get_job_table(
-        skip_finished=skip_finished
+        skip_finished=skip_finished,
+        fields=_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -646,8 +716,7 @@ def queue(refresh: bool,
         does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
-    jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids
-                             None, None, None, None, None, None)
+    jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids)
 
    return jobs
 
@@ -764,7 +833,8 @@ def queue_v2(
     try:
         request = managed_jobsv1_pb2.GetJobTableRequest(
             skip_finished=skip_finished,
-            accessible_workspaces=
+            accessible_workspaces=(managed_jobsv1_pb2.Workspaces(
+                workspaces=accessible_workspaces)),
             job_ids=managed_jobsv1_pb2.JobIds(
                 ids=job_ids) if job_ids is not None else None,
             workspace_match=workspace_match,
@@ -780,6 +850,8 @@ def queue_v2(
             ]) if user_hashes is not None else None,
             statuses=managed_jobsv1_pb2.Statuses(
                 statuses=statuses) if statuses is not None else None,
+            fields=managed_jobsv1_pb2.Fields(
+                fields=fields) if fields is not None else None,
             show_jobs_without_user_hash=show_jobs_without_user_hash,
         )
         response = backend_utils.invoke_skylet_with_retries(
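The parallel submission added to launch() follows a standard pattern: one future per job rank, with each result written back into a rank-indexed list so the returned IDs keep a deterministic order even though futures complete out of order. A self-contained sketch of that pattern; the _submit_one stand-in below is illustrative and not SkyPilot's implementation:

import concurrent.futures
import os
from typing import List, Optional, Tuple


def _submit_one(rank: int, total: int) -> Tuple[int, str]:
    """Stand-in for the real submission call; returns a fake (job_id, handle)."""
    return 1000 + rank, f'handle-{rank}-of-{total}'


num_jobs = 4
results: List[Optional[Tuple[int, str]]] = [None] * num_jobs
with concurrent.futures.ThreadPoolExecutor(
        max_workers=min(num_jobs, os.cpu_count() or 1)) as executor:
    future_to_rank = {
        executor.submit(_submit_one, rank, num_jobs): rank
        for rank in range(num_jobs)
    }
    # Futures finish in arbitrary order; writing into a rank-indexed slot
    # restores a deterministic output order.
    for future in concurrent.futures.as_completed(future_to_rank):
        rank = future_to_rank[future]
        results[rank] = future.result()

ids = [res[0] for res in results if res is not None]
assert ids == [1000, 1001, 1002, 1003]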
sky/jobs/server/server.py
CHANGED
@@ -11,6 +11,7 @@ from sky.server import common as server_common
 from sky.server import stream_utils
 from sky.server.requests import executor
 from sky.server.requests import payloads
+from sky.server.requests import request_names
 from sky.server.requests import requests as api_requests
 from sky.skylet import constants
 from sky.utils import common
@@ -37,7 +38,7 @@ async def launch(request: fastapi.Request,
         if consolidation_mode else api_requests.ScheduleType.LONG)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_LAUNCH,
         request_body=jobs_launch_body,
         func=core.launch,
         schedule_type=schedule_type,
@@ -52,7 +53,7 @@ async def queue(request: fastapi.Request,
                 jobs_queue_body: payloads.JobsQueueBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_QUEUE,
         request_body=jobs_queue_body,
         func=core.queue,
         schedule_type=(api_requests.ScheduleType.LONG if jobs_queue_body.refresh
@@ -66,7 +67,7 @@ async def queue_v2(request: fastapi.Request,
                    jobs_queue_body_v2: payloads.JobsQueueV2Body) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_QUEUE_V2,
         request_body=jobs_queue_body_v2,
         func=core.queue_v2_api,
         schedule_type=(api_requests.ScheduleType.LONG
@@ -81,7 +82,7 @@ async def cancel(request: fastapi.Request,
                  jobs_cancel_body: payloads.JobsCancelBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_CANCEL,
         request_body=jobs_cancel_body,
         func=core.cancel,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -103,7 +104,7 @@ async def logs(
     executor.check_request_thread_executor_available()
     request_task = await executor.prepare_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_LOGS,
         request_body=jobs_logs_body,
         func=core.tail_logs,
         schedule_type=schedule_type,
@@ -143,7 +144,7 @@ async def download_logs(
     jobs_download_logs_body.local_dir = str(logs_dir_on_api_server)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_DOWNLOAD_LOGS,
         request_body=jobs_download_logs_body,
         func=core.download_logs,
         schedule_type=api_requests.ScheduleType.LONG
@@ -157,7 +158,7 @@ async def pool_apply(request: fastapi.Request,
                     jobs_pool_apply_body: payloads.JobsPoolApplyBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_APPLY,
         request_body=jobs_pool_apply_body,
         func=core.pool_apply,
         schedule_type=api_requests.ScheduleType.LONG,
@@ -170,7 +171,7 @@ async def pool_down(request: fastapi.Request,
                    jobs_pool_down_body: payloads.JobsPoolDownBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_DOWN,
         request_body=jobs_pool_down_body,
         func=core.pool_down,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -184,7 +185,7 @@ async def pool_status(
         jobs_pool_status_body: payloads.JobsPoolStatusBody) -> None:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_STATUS,
         request_body=jobs_pool_status_body,
         func=core.pool_status,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -199,7 +200,7 @@ async def pool_tail_logs(
 ) -> fastapi.responses.StreamingResponse:
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_LOGS,
         request_body=log_body,
         func=core.pool_tail_logs,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -235,7 +236,7 @@ async def pool_download_logs(
     download_logs_body.local_dir = str(logs_dir_on_api_server)
     await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.JOBS_POOL_SYNC_DOWN_LOGS,
         request_body=download_logs_body,
         func=core.pool_sync_down_logs,
         schedule_type=api_requests.ScheduleType.SHORT,
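The handlers above reference members of a new request_names.RequestName type; the module itself (sky/server/requests/request_names.py, +80 lines) is not expanded in this diff. A hypothetical sketch of what such a registry could look like, restricted to the members used above; the enum base class and the string values are assumptions, not taken from the package:

import enum


class RequestName(str, enum.Enum):
    """Hypothetical reconstruction; only members referenced in this diff are listed."""
    JOBS_LAUNCH = 'jobs.launch'
    JOBS_QUEUE = 'jobs.queue'
    JOBS_QUEUE_V2 = 'jobs.queue_v2'
    JOBS_CANCEL = 'jobs.cancel'
    JOBS_LOGS = 'jobs.logs'
    JOBS_DOWNLOAD_LOGS = 'jobs.download_logs'
    JOBS_POOL_APPLY = 'jobs.pool_apply'
    JOBS_POOL_DOWN = 'jobs.pool_down'
    JOBS_POOL_STATUS = 'jobs.pool_status'
    JOBS_POOL_LOGS = 'jobs.pool_logs'
    JOBS_POOL_SYNC_DOWN_LOGS = 'jobs.pool_sync_down_logs'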
sky/jobs/server/utils.py
CHANGED
@@ -19,6 +19,11 @@ else:
     managed_jobsv1_pb2 = adaptors_common.LazyImport(
         'sky.schemas.generated.managed_jobsv1_pb2')
 
+_MANAGED_JOB_FIELDS_TO_GET = [
+    'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
+    'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
+]
+
 
 def check_version_mismatch_and_non_terminal_jobs() -> None:
     """Check if controller has version mismatch and non-terminal jobs exist.
@@ -50,7 +55,11 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
         )).get_managed_job_controller_version(version_request))
     controller_version = version_response.controller_version
 
-    job_table_request = managed_jobsv1_pb2.GetJobTableRequest(
+    job_table_request = managed_jobsv1_pb2.GetJobTableRequest(
+        skip_finished=True,
+        fields=managed_jobsv1_pb2.Fields(
+            fields=_MANAGED_JOB_FIELDS_TO_GET),
+    )
     job_table_response = backend_utils.invoke_skylet_with_retries(
         lambda: cloud_vm_ray_backend.SkyletClient(
             handle.get_grpc_channel()).get_managed_job_table(
@@ -62,7 +71,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
 
     if use_legacy:
         # Get controller version and raw job table
-        code = managed_job_utils.ManagedJobCodeGen.
+        code = managed_job_utils.ManagedJobCodeGen.get_version()
 
         returncode, output, stderr = backend.run_on_head(handle,
                                                          code,
@@ -72,7 +81,7 @@
 
         if returncode != 0:
             logger.error(output + stderr)
-            raise ValueError('Failed to check controller version
+            raise ValueError('Failed to check controller version with '
                              f'returncode: {returncode}.\n{output + stderr}')
 
         # Parse the output to extract controller version (split only on first
@@ -80,19 +89,28 @@
         output_parts = output.strip().split('\n', 1)
 
         # Extract controller version from first line
-        if
-        'controller_version:'):
+        if not output_parts[0].startswith('controller_version:'):
             raise ValueError(
                 f'Expected controller version in first line, got: {output}')
 
         controller_version = output_parts[0].split(':', 1)[1]
 
-
-
+        code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+            skip_finished=True, fields=_MANAGED_JOB_FIELDS_TO_GET)
+        returncode, job_table_payload, stderr = backend.run_on_head(
+            handle,
+            code,
+            require_outputs=True,
+            stream_logs=False,
+            separate_stderr=True)
+
+        if returncode != 0:
+            logger.error(job_table_payload + stderr)
+            raise ValueError('Failed to fetch managed jobs with returncode: '
+                             f'{returncode}.\n{job_table_payload + stderr}')
 
-
-
-        job_table_payload)
+        jobs, _, _, _, _ = (
+            managed_job_utils.load_managed_job_queue(job_table_payload))
 
     # Process locally: check version match and filter non-terminal jobs
     version_matches = (controller_version == local_version or