skypilot-nightly 1.0.0.dev20250814__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +43 -1
- sky/backends/backend_utils.py +6 -2
- sky/backends/cloud_vm_ray_backend.py +13 -4
- sky/client/cli/command.py +22 -8
- sky/client/sdk.py +50 -0
- sky/clouds/kubernetes.py +2 -6
- sky/clouds/nebius.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
- sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
- sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
- sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
- sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
- sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
- sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
- sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
- sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
- sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
- sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
- sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c2ea34fda4f1f8c8.js → _app-ce361c6959bc2001.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-664c36eda967b1ba.js → [pool]-7d4182df6625fe10.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/execution.py +13 -10
- sky/global_user_state.py +128 -1
- sky/jobs/constants.py +1 -1
- sky/jobs/scheduler.py +14 -21
- sky/jobs/server/core.py +64 -10
- sky/jobs/server/utils.py +1 -1
- sky/jobs/state.py +1 -3
- sky/jobs/utils.py +159 -8
- sky/provision/aws/config.py +19 -3
- sky/provision/aws/instance.py +2 -1
- sky/provision/nebius/utils.py +101 -86
- sky/provision/provisioner.py +13 -8
- sky/resources.py +5 -5
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/serve/replica_managers.py +123 -101
- sky/serve/serve_state.py +32 -0
- sky/serve/serve_utils.py +37 -16
- sky/serve/service.py +51 -17
- sky/server/constants.py +1 -1
- sky/server/requests/payloads.py +6 -0
- sky/server/requests/serializers/decoders.py +12 -2
- sky/server/requests/serializers/encoders.py +10 -2
- sky/server/server.py +44 -2
- sky/templates/kubernetes-ray.yml.j2 +1 -0
- sky/utils/common_utils.py +20 -0
- sky/utils/controller_utils.py +17 -4
- sky/utils/db/migration_utils.py +1 -1
- sky/utils/log_utils.py +14 -5
- sky/utils/resources_utils.py +25 -1
- sky/utils/schemas.py +3 -0
- sky/utils/ux_utils.py +36 -5
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +99 -98
- sky/dashboard/out/_next/static/Y0eNlwi85qGRecLTin11y/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
- sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
- sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
- sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
- sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
- sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
- sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
- sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
- sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
- sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
- sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-00c0a51d21157453.js +0 -1
- /sky/dashboard/out/_next/static/{Y0eNlwi85qGRecLTin11y → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{6989-37611fe6b86d274d.js → 6989-01359c57e018caa4.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250814.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
@@ -6,6 +6,7 @@ Concepts:
 - Cluster handle: (non-user facing) an opaque backend handle for us to
   interact with a cluster.
 """
+import asyncio
 import enum
 import functools
 import json
@@ -51,6 +52,9 @@ _ALLOWED_CLOUDS_KEY_PREFIX = 'allowed_clouds_'
 _SQLALCHEMY_ENGINE: Optional[sqlalchemy.engine.Engine] = None
 _SQLALCHEMY_ENGINE_LOCK = threading.Lock()

+DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
+MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS = 3600
+
 Base = declarative.declarative_base()

 config_table = sqlalchemy.Table(
@@ -102,6 +106,9 @@ cluster_table = sqlalchemy.Table(
                       sqlalchemy.Text,
                       server_default=None),
     sqlalchemy.Column('is_managed', sqlalchemy.Integer, server_default='0'),
+    sqlalchemy.Column('provision_log_path',
+                      sqlalchemy.Text,
+                      server_default=None),
 )

 storage_table = sqlalchemy.Table(
@@ -161,6 +168,9 @@ cluster_history_table = sqlalchemy.Table(
                       sqlalchemy.Text,
                       server_default=None),
     sqlalchemy.Column('workspace', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('provision_log_path',
+                      sqlalchemy.Text,
+                      server_default=None),
 )


@@ -430,6 +440,17 @@ def get_user_by_name(username: str) -> List[models.User]:
     ]


+@_init_db
+def get_user_by_name_match(username_match: str) -> List[models.User]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(user_table).filter(
+            user_table.c.name.like(f'%{username_match}%')).all()
+    return [
+        models.User(id=row.id, name=row.name, created_at=row.created_at)
+        for row in rows
+    ]
+
+
 @_init_db
 def delete_user(user_id: str) -> None:
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
@@ -458,7 +479,8 @@ def add_or_update_cluster(cluster_name: str,
                           is_launch: bool = True,
                           config_hash: Optional[str] = None,
                           task_config: Optional[Dict[str, Any]] = None,
-                          is_managed: bool = False
+                          is_managed: bool = False,
+                          provision_log_path: Optional[str] = None):
     """Adds or updates cluster_name -> cluster_handle mapping.

     Args:
@@ -473,6 +495,7 @@ def add_or_update_cluster(cluster_name: str,
         task_config: The config of the task being launched.
         is_managed: Whether the cluster is launched by the
             controller.
+        provision_log_path: Absolute path to provision.log, if available.
     """
     assert _SQLALCHEMY_ENGINE is not None
     # FIXME: launched_at will be changed when `sky launch -c` is called.
@@ -555,6 +578,10 @@ def add_or_update_cluster(cluster_name: str,
                                 if task_config else None,
             'last_creation_command': last_use,
         })
+    if provision_log_path is not None:
+        conditional_values.update({
+            'provision_log_path': provision_log_path,
+        })

     if (_SQLALCHEMY_ENGINE.dialect.name ==
             db_utils.SQLAlchemyDialect.SQLITE.value):
@@ -618,6 +645,7 @@ def add_or_update_cluster(cluster_name: str,
             usage_intervals=pickle.dumps(usage_intervals),
             user_hash=user_hash,
             workspace=history_workspace,
+            provision_log_path=provision_log_path,
             **creation_info,
         )
         do_update_stmt = insert_stmnt.on_conflict_do_update(
@@ -633,6 +661,7 @@ def add_or_update_cluster(cluster_name: str,
                     pickle.dumps(usage_intervals),
                 cluster_history_table.c.user_hash: history_hash,
                 cluster_history_table.c.workspace: history_workspace,
+                cluster_history_table.c.provision_log_path: provision_log_path,
                 **creation_info,
             })
         session.execute(do_update_stmt)
@@ -731,6 +760,41 @@ def get_last_cluster_event(cluster_hash: str,
     return row.reason


+def cleanup_cluster_events_with_retention(retention_hours: float) -> None:
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        query = session.query(cluster_event_table).filter(
+            cluster_event_table.c.transitioned_at < time.time() -
+            retention_hours * 3600)
+        logger.debug(f'Deleting {query.count()} cluster events.')
+        query.delete()
+        session.commit()
+
+
+async def cluster_event_retention_daemon():
+    """Garbage collect cluster events periodically."""
+    while True:
+        logger.info('Running cluster event retention daemon...')
+        # Use the latest config.
+        skypilot_config.reload_config()
+        retention_hours = skypilot_config.get_nested(
+            ('api_server', 'cluster_event_retention_hours'),
+            DEFAULT_CLUSTER_EVENT_RETENTION_HOURS)
+        try:
+            if retention_hours >= 0:
+                cleanup_cluster_events_with_retention(retention_hours)
+        except asyncio.CancelledError:
+            logger.info('Cluster event retention daemon cancelled')
+            break
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(f'Error running cluster event retention daemon: {e}')
+
+        # Run daemon at most once every hour to avoid too frequent cleanup.
+        sleep_amount = max(retention_hours * 3600,
+                           MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)
+        await asyncio.sleep(sleep_amount)
+
+
 def get_cluster_events(cluster_name: Optional[str], cluster_hash: Optional[str],
                        event_type: ClusterEventType) -> List[str]:
     """Returns the cluster events for the cluster.
@@ -798,6 +862,7 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
     assert _SQLALCHEMY_ENGINE is not None
     cluster_hash = _get_hash_for_existing_cluster(cluster_name)
     usage_intervals = _get_cluster_usage_intervals(cluster_hash)
+    provision_log_path = get_cluster_provision_log_path(cluster_name)

     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         # usage_intervals is not None and not empty
@@ -808,6 +873,16 @@ def remove_cluster(cluster_name: str, terminate: bool) -> None:
             usage_intervals.append((start_time, end_time))
             _set_cluster_usage_intervals(cluster_hash, usage_intervals)

+        if provision_log_path:
+            assert cluster_hash is not None, cluster_name
+            session.query(cluster_history_table).filter_by(
+                cluster_hash=cluster_hash
+            ).filter(
+                cluster_history_table.c.provision_log_path.is_(None)
+            ).update({
+                cluster_history_table.c.provision_log_path: provision_log_path
+            })
+
         if terminate:
             session.query(cluster_table).filter_by(name=cluster_name).delete()
             session.query(cluster_event_table).filter_by(
@@ -915,6 +990,58 @@ def get_cluster_info(cluster_name: str) -> Optional[Dict[str, Any]]:
     return json.loads(row.metadata)


+@_init_db
+def get_cluster_provision_log_path(cluster_name: str) -> Optional[str]:
+    """Returns provision_log_path from clusters table, if recorded."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_table).filter_by(name=cluster_name).first()
+        if row is None:
+            return None
+        return getattr(row, 'provision_log_path', None)
+
+
+@_init_db
+def get_cluster_history_provision_log_path(cluster_name: str) -> Optional[str]:
+    """Returns provision_log_path from cluster_history for this name.
+
+    If the cluster currently exists, we use its hash. Otherwise, we look up
+    historical rows by name and choose the most recent one based on
+    usage_intervals.
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        # Try current cluster first (fast path)
+        cluster_hash = _get_hash_for_existing_cluster(cluster_name)
+        if cluster_hash is not None:
+            row = session.query(cluster_history_table).filter_by(
+                cluster_hash=cluster_hash).first()
+            if row is not None:
+                return getattr(row, 'provision_log_path', None)
+
+        # Fallback: search history by name and pick the latest by
+        # usage_intervals
+        rows = session.query(cluster_history_table).filter_by(
+            name=cluster_name).all()
+        if not rows:
+            return None
+
+        def latest_timestamp(usages_bin) -> int:
+            try:
+                intervals = pickle.loads(usages_bin)
+                # intervals: List[Tuple[int, Optional[int]]]
+                if not intervals:
+                    return -1
+                _, end = intervals[-1]
+                return end if end is not None else int(time.time())
+            except Exception:  # pylint: disable=broad-except
+                return -1
+
+        latest_row = max(rows,
+                         key=lambda r: latest_timestamp(r.usage_intervals))
+        return getattr(latest_row, 'provision_log_path', None)
+
+
 @_init_db
 def set_cluster_info(cluster_name: str, metadata: Dict[str, Any]) -> None:
     assert _SQLALCHEMY_ENGINE is not None
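The retention logic added above reduces to two pieces of arithmetic: cluster events whose transitioned_at is older than retention_hours * 3600 seconds are deleted, and the daemon sleeps at least MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS between passes. Below is a minimal standalone sketch of just that timing math, assuming the constants mirror the ones added in the diff; the database and config plumbing are omitted.

import time

# Constants mirroring the values added in the diff above.
DEFAULT_CLUSTER_EVENT_RETENTION_HOURS = 24.0
MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS = 3600


def retention_cutoff(retention_hours: float, now: float) -> float:
    """Events with transitioned_at earlier than this timestamp are deleted."""
    return now - retention_hours * 3600


def daemon_sleep_seconds(retention_hours: float) -> float:
    """The daemon never wakes up more often than once per hour."""
    return max(retention_hours * 3600,
               MIN_CLUSTER_EVENT_DAEMON_INTERVAL_SECONDS)


if __name__ == '__main__':
    now = time.time()
    hours = DEFAULT_CLUSTER_EVENT_RETENTION_HOURS
    print(f'cutoff: {retention_cutoff(hours, now):.0f} '
          f'(events older than {hours}h are dropped)')
    print(f'sleep between passes: {daemon_sleep_seconds(hours):.0f}s')
    # Even with a short retention (0.5h), the sleep is clamped to 3600s.
    print(f'sleep at 0.5h retention: {daemon_sleep_seconds(0.5):.0f}s')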
sky/jobs/constants.py
CHANGED
@@ -47,7 +47,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
 # The version of the lib files that jobs/utils use. Whenever there is an API
 # change for the jobs/utils, we need to bump this version and update
 # job.utils.ManagedJobCodeGen to handle the version update.
-MANAGED_JOBS_VERSION =
+MANAGED_JOBS_VERSION = 9

 # The command for setting up the jobs dashboard on the controller. It firstly
 # checks if the systemd services are available, and if not (e.g., Kubernetes
sky/jobs/scheduler.py
CHANGED
@@ -93,7 +93,7 @@ def _start_controller(job_id: int, dag_yaml_path: str, env_file_path: str,
     logger.debug(f'Job {job_id} started with pid {pid}')


-def maybe_schedule_next_jobs(
+def maybe_schedule_next_jobs() -> None:
     """Determine if any managed jobs can be scheduled, and if so, schedule them.

     Here, "schedule" means to select job that is waiting, and allow it to
@@ -139,7 +139,7 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
         with filelock.FileLock(controller_utils.get_resources_lock_path(),
                                blocking=False):
             while True:
-                maybe_next_job = state.get_waiting_job(
+                maybe_next_job = state.get_waiting_job()
                 if maybe_next_job is None:
                     # Nothing left to start, break from scheduling loop
                     break
@@ -158,22 +158,11 @@ def maybe_schedule_next_jobs(pool: Optional[str] = None) -> None:
                 # an ALIVE_WAITING job, but we would be able to launch a WAITING
                 # job.
                 if current_state == state.ManagedJobScheduleState.ALIVE_WAITING:
-                    if not
-                            actual_pool is not None):
+                    if not controller_utils.can_provision():
                         # Can't schedule anything, break from scheduling loop.
                         break
                 elif current_state == state.ManagedJobScheduleState.WAITING:
                     if not _can_start_new_job(actual_pool):
-                        # If there is no job can be scheduled in the pool, we
-                        # try to schedule another job regardless of the pool.
-                        # This is to avoid the case where the pool is scaled
-                        # down at the same time as a job is done. In this case,
-                        # we won't have any job to schedule in the pool, but
-                        # other jobs in other pool (or no pool) can still be
-                        # scheduled.
-                        if pool is not None:
-                            pool = None
-                            continue
                         # Can't schedule anything, break from scheduling loop.
                         break

@@ -218,7 +207,7 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
     if is_resume:
         _start_controller(job_id, dag_yaml_path, env_file_path, pool)
     else:
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()


 @contextlib.contextmanager
@@ -243,6 +232,13 @@ def scheduled_launch(job_id: int):
     multiple uses of this context are nested, behavior is undefined. Don't do
     that.
     """
+    pool = state.get_pool_from_job_id(job_id)
+    # For pool, since there is no execution.launch, we don't need to have all
+    # the ALIVE_WAITING state. The state transition will be
+    # WAITING -> ALIVE -> DONE without any intermediate transitions.
+    if pool is not None:
+        yield
+        return

     # If we're already in LAUNCHING schedule_state, we don't need to wait.
     # This may be the case for the first launch of a job.
@@ -254,7 +250,6 @@ def scheduled_launch(job_id: int):
     while (state.get_job_schedule_state(job_id) !=
            state.ManagedJobScheduleState.LAUNCHING):
         time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
-    pool = state.get_pool_from_job_id(job_id)

     try:
         yield
@@ -268,7 +263,7 @@ def scheduled_launch(job_id: int):
         with filelock.FileLock(controller_utils.get_resources_lock_path()):
             state.scheduler_set_alive(job_id)
     finally:
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()


 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -283,19 +278,17 @@ def job_done(job_id: int, idempotent: bool = False) -> None:
     if idempotent and (state.get_job_schedule_state(job_id)
                        == state.ManagedJobScheduleState.DONE):
         return
-    pool = state.get_pool_from_job_id(job_id)

     with filelock.FileLock(controller_utils.get_resources_lock_path()):
         state.scheduler_set_done(job_id, idempotent)
-        maybe_schedule_next_jobs(
+        maybe_schedule_next_jobs()


 def _set_alive_waiting(job_id: int) -> None:
     """Should use wait_until_launch_okay() to transition to this state."""
     with filelock.FileLock(controller_utils.get_resources_lock_path()):
         state.scheduler_set_alive_waiting(job_id)
-
-        maybe_schedule_next_jobs(pool)
+        maybe_schedule_next_jobs()


 def _can_start_new_job(pool: Optional[str]) -> bool:
sky/jobs/server/core.py
CHANGED
@@ -497,7 +497,8 @@ def queue_from_kubernetes_pod(
     managed_jobs_runner = provision_lib.get_command_runners(
         'kubernetes', cluster_info)[0]

-    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished=skip_finished)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -513,7 +514,14 @@ def queue_from_kubernetes_pod(
     except exceptions.CommandError as e:
         raise RuntimeError(str(e)) from e

-    jobs = managed_job_utils.load_managed_job_queue(
+    jobs, _, result_type = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
+
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs
+
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if skip_finished:
         # Filter out the finished jobs. If a multi-task job is partially
         # finished, we will include all its tasks.
@@ -568,10 +576,18 @@ def _maybe_restart_controller(


 @usage_lib.entrypoint
-def queue(
-
-
-
+def queue(
+    refresh: bool,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    job_ids: Optional[List[int]] = None,
+    user_match: Optional[str] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+) -> Tuple[List[Dict[str, Any]], int]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets statuses of managed jobs.

@@ -601,6 +617,17 @@ def queue(refresh: bool,
         does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
+    if limit is not None:
+        if limit < 1:
+            raise ValueError(f'Limit must be at least 1, got {limit}')
+        if page is None:
+            page = 1
+        if page < 1:
+            raise ValueError(f'Page must be at least 1, got {page}')
+    else:
+        if page is not None:
+            raise ValueError('Limit must be specified when page is specified')
+
     handle = _maybe_restart_controller(refresh,
                                        stopped_message='No in-progress '
                                        'managed jobs.',
@@ -609,7 +636,22 @@ def queue(refresh: bool,
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)

-
+    user_hashes: Optional[List[Optional[str]]] = None
+    if not all_users:
+        user_hashes = [common_utils.get_user_hash()]
+        # For backwards compatibility, we show jobs that do not have a
+        # user_hash. TODO(cooperc): Remove before 0.12.0.
+        user_hashes.append(None)
+    elif user_match is not None:
+        users = global_user_state.get_user_by_name_match(user_match)
+        if not users:
+            return [], 0
+        user_hashes = [user.id for user in users]
+
+    accessible_workspaces = list(workspaces_core.get_workspaces().keys())
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished, accessible_workspaces, job_ids, workspace_match,
+        name_match, pool_match, page, limit, user_hashes)
     returncode, job_table_payload, stderr = backend.run_on_head(
         handle,
         code,
@@ -622,8 +664,14 @@ def queue(refresh: bool,
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
                            f'{returncode}.\n{job_table_payload + stderr}')

-    jobs = managed_job_utils.load_managed_job_queue(
+    jobs, total, result_type = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
+
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs, total

+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if not all_users:

         def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
@@ -636,7 +684,6 @@ def queue(refresh: bool,

         jobs = list(filter(user_hash_matches_or_missing, jobs))

-    accessible_workspaces = workspaces_core.get_workspaces()
     jobs = list(
         filter(
             lambda job: job.get('workspace', skylet_constants.
@@ -655,7 +702,14 @@ def queue(refresh: bool,
     if job_ids:
         jobs = [job for job in jobs if job['job_id'] in job_ids]

-    return jobs
+    return managed_job_utils.filter_jobs(jobs,
+                                         workspace_match,
+                                         name_match,
+                                         pool_match,
+                                         page=page,
+                                         limit=limit,
+                                         user_match=user_match,
+                                         enable_user_match=True)


 @usage_lib.entrypoint
sky/jobs/server/utils.py
CHANGED
@@ -62,7 +62,7 @@ def check_version_mismatch_and_non_terminal_jobs() -> None:
     version_matches = controller_version == local_version

     # Load and filter jobs locally using existing method
-    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, _, _ = managed_job_utils.load_managed_job_queue(job_table_payload)
     non_terminal_jobs = [job for job in jobs if not job['status'].is_terminal()]
     has_non_terminal_jobs = len(non_terminal_jobs) > 0

sky/jobs/state.py
CHANGED
@@ -1528,7 +1528,7 @@ def get_nonterminal_job_ids_by_pool(pool: str,


 @_init_db
-def get_waiting_job(
+def get_waiting_job() -> Optional[Dict[str, Any]]:
     """Get the next job that should transition to LAUNCHING.

     Selects the highest-priority WAITING or ALIVE_WAITING job, provided its
@@ -1559,8 +1559,6 @@ def get_waiting_job(pool: Optional[str]) -> Optional[Dict[str, Any]]:
         job_info_table.c.priority >= sqlalchemy.func.coalesce(
             max_priority_subquery, 0),
     ]
-    if pool is not None:
-        select_conds.append(job_info_table.c.pool == pool)
     query = sqlalchemy.select(
         job_info_table.c.spot_job_id,
         job_info_table.c.schedule_state,