skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251101__py3-none-any.whl

This diff shows the content of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two published versions.

This version of skypilot-nightly has been flagged as potentially problematic.

Files changed (114)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/backends/backend_utils.py +9 -6
  5. sky/backends/cloud_vm_ray_backend.py +2 -3
  6. sky/check.py +25 -13
  7. sky/client/cli/command.py +52 -24
  8. sky/cloud_stores.py +73 -0
  9. sky/clouds/aws.py +59 -11
  10. sky/core.py +7 -5
  11. sky/dashboard/out/404.html +1 -1
  12. sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_buildManifest.js +1 -1
  13. sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
  14. sky/dashboard/out/_next/static/chunks/2755.d6dc6d530fed0b61.js +26 -0
  15. sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
  26. sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
  27. sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-e38d5319cd10a3a0.js} +1 -1
  28. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  29. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  30. sky/dashboard/out/clusters/[cluster].html +1 -1
  31. sky/dashboard/out/clusters.html +1 -1
  32. sky/dashboard/out/config.html +1 -1
  33. sky/dashboard/out/index.html +1 -1
  34. sky/dashboard/out/infra/[context].html +1 -1
  35. sky/dashboard/out/infra.html +1 -1
  36. sky/dashboard/out/jobs/[job].html +1 -1
  37. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  38. sky/dashboard/out/jobs.html +1 -1
  39. sky/dashboard/out/users.html +1 -1
  40. sky/dashboard/out/volumes.html +1 -1
  41. sky/dashboard/out/workspace/new.html +1 -1
  42. sky/dashboard/out/workspaces/[name].html +1 -1
  43. sky/dashboard/out/workspaces.html +1 -1
  44. sky/data/data_utils.py +92 -1
  45. sky/data/mounting_utils.py +71 -2
  46. sky/data/storage.py +166 -9
  47. sky/global_user_state.py +14 -18
  48. sky/jobs/constants.py +2 -0
  49. sky/jobs/controller.py +62 -67
  50. sky/jobs/file_content_utils.py +80 -0
  51. sky/jobs/log_gc.py +201 -0
  52. sky/jobs/scheduler.py +15 -2
  53. sky/jobs/server/core.py +85 -13
  54. sky/jobs/server/server.py +14 -13
  55. sky/jobs/server/utils.py +28 -10
  56. sky/jobs/state.py +216 -40
  57. sky/jobs/utils.py +65 -28
  58. sky/metrics/utils.py +18 -0
  59. sky/optimizer.py +1 -1
  60. sky/provision/kubernetes/instance.py +88 -19
  61. sky/provision/kubernetes/volume.py +2 -2
  62. sky/schemas/api/responses.py +3 -5
  63. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  64. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  65. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  66. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  67. sky/serve/replica_managers.py +2 -2
  68. sky/serve/serve_utils.py +9 -2
  69. sky/serve/server/server.py +8 -7
  70. sky/server/common.py +21 -15
  71. sky/server/constants.py +1 -1
  72. sky/server/daemons.py +23 -17
  73. sky/server/requests/executor.py +7 -3
  74. sky/server/requests/payloads.py +2 -0
  75. sky/server/requests/request_names.py +80 -0
  76. sky/server/requests/requests.py +137 -102
  77. sky/server/requests/serializers/decoders.py +0 -6
  78. sky/server/requests/serializers/encoders.py +33 -6
  79. sky/server/server.py +105 -36
  80. sky/server/stream_utils.py +56 -13
  81. sky/setup_files/dependencies.py +2 -0
  82. sky/skylet/constants.py +6 -1
  83. sky/skylet/events.py +7 -0
  84. sky/skylet/services.py +18 -7
  85. sky/ssh_node_pools/server.py +5 -4
  86. sky/task.py +14 -42
  87. sky/templates/kubernetes-ray.yml.j2 +1 -1
  88. sky/templates/nebius-ray.yml.j2 +1 -0
  89. sky/templates/websocket_proxy.py +140 -12
  90. sky/users/permission.py +4 -1
  91. sky/utils/cli_utils/status_utils.py +8 -2
  92. sky/utils/context_utils.py +13 -1
  93. sky/utils/db/migration_utils.py +1 -1
  94. sky/utils/resource_checker.py +4 -1
  95. sky/utils/resources_utils.py +53 -29
  96. sky/utils/schemas.py +23 -4
  97. sky/volumes/server/server.py +4 -3
  98. sky/workspaces/server.py +7 -6
  99. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/METADATA +53 -37
  100. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/RECORD +106 -100
  101. sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
  102. sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
  103. sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
  107. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
  108. sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
  109. /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → 8ixeA0NVQJN8HUdijid8b}/_ssgManifest.js +0 -0
  110. /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
  111. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/WHEEL +0 -0
  112. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/entry_points.txt +0 -0
  113. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/licenses/LICENSE +0 -0
  114. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251101.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -93,6 +93,7 @@ spot_table = sqlalchemy.Table(
     sqlalchemy.Column('specs', sqlalchemy.Text),
     sqlalchemy.Column('local_log_file', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
+    sqlalchemy.Column('logs_cleaned_at', sqlalchemy.Float, server_default=None),
 )

 job_info_table = sqlalchemy.Table(
@@ -108,6 +109,8 @@ job_info_table = sqlalchemy.Table(
                       server_default=None),
     sqlalchemy.Column('dag_yaml_path', sqlalchemy.Text),
     sqlalchemy.Column('env_file_path', sqlalchemy.Text),
+    sqlalchemy.Column('dag_yaml_content', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('env_file_content', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('user_hash', sqlalchemy.Text),
     sqlalchemy.Column('workspace', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('priority',
@@ -117,6 +120,9 @@ job_info_table = sqlalchemy.Table(
     sqlalchemy.Column('original_user_yaml_path',
                       sqlalchemy.Text,
                       server_default=None),
+    sqlalchemy.Column('original_user_yaml_content',
+                      sqlalchemy.Text,
+                      server_default=None),
     sqlalchemy.Column('pool', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('current_cluster_name',
                       sqlalchemy.Text,
@@ -125,6 +131,9 @@ job_info_table = sqlalchemy.Table(
                       sqlalchemy.Integer,
                       server_default=None),
     sqlalchemy.Column('pool_hash', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('controller_logs_cleaned_at',
+                      sqlalchemy.Float,
+                      server_default=None),
 )

 ha_recovery_script_table = sqlalchemy.Table(
@@ -313,6 +322,8 @@ async def _describe_task_transition_failure(session: sql_async.AsyncSession,
 # column names in the DB and it corresponds to the combined view
 # by joining the spot and job_info tables.
 def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
+    # WARNING: If you update these you may also need to update GetJobTable in
+    # the skylet ManagedJobsServiceImpl.
     return {
         '_job_id': r.get('job_id'),  # from spot table
         '_task_name': r.get('job_name'),  # deprecated, from spot table
@@ -339,13 +350,18 @@ def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
         'job_name': r.get('name'),  # from job_info table
         'schedule_state': r.get('schedule_state'),
         'controller_pid': r.get('controller_pid'),
+        # the _path columns are for backwards compatibility, use the _content
+        # columns instead
         'dag_yaml_path': r.get('dag_yaml_path'),
         'env_file_path': r.get('env_file_path'),
+        'dag_yaml_content': r.get('dag_yaml_content'),
+        'env_file_content': r.get('env_file_content'),
         'user_hash': r.get('user_hash'),
         'workspace': r.get('workspace'),
         'priority': r.get('priority'),
         'entrypoint': r.get('entrypoint'),
         'original_user_yaml_path': r.get('original_user_yaml_path'),
+        'original_user_yaml_content': r.get('original_user_yaml_content'),
         'pool': r.get('pool'),
         'current_cluster_name': r.get('current_cluster_name'),
         'job_id_on_pool_cluster': r.get('job_id_on_pool_cluster'),
@@ -1076,7 +1092,8 @@ def _get_all_task_ids_statuses(

 @_init_db
 def get_all_task_ids_names_statuses_logs(
-        job_id: int) -> List[Tuple[int, str, ManagedJobStatus, str]]:
+    job_id: int
+) -> List[Tuple[int, str, ManagedJobStatus, str, Optional[float]]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         id_names = session.execute(
@@ -1085,9 +1102,10 @@ def get_all_task_ids_names_statuses_logs(
                 spot_table.c.task_name,
                 spot_table.c.status,
                 spot_table.c.local_log_file,
+                spot_table.c.logs_cleaned_at,
             ).where(spot_table.c.spot_job_id == job_id).order_by(
                 spot_table.c.task_id.asc())).fetchall()
-        return [(row[0], row[1], ManagedJobStatus(row[2]), row[3])
+        return [(row[0], row[1], ManagedJobStatus(row[2]), row[3], row[4])
                 for row in id_names]

@@ -1152,8 +1170,8 @@ def get_failure_reason(job_id: int) -> Optional[str]:


 @_init_db
-def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
-    """Get managed jobs from the database."""
+def get_managed_job_tasks(job_id: int) -> List[Dict[str, Any]]:
+    """Get managed job tasks for a specific managed job id from the database."""
     assert _SQLALCHEMY_ENGINE is not None

     # Join spot and job_info tables to get the job name for each task.
@@ -1168,10 +1186,8 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
         spot_table.outerjoin(
             job_info_table,
             spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
-    if job_id is not None:
-        query = query.where(spot_table.c.spot_job_id == job_id)
-    query = query.order_by(spot_table.c.spot_job_id.desc(),
-                           spot_table.c.task_id.asc())
+    query = query.where(spot_table.c.spot_job_id == job_id)
+    query = query.order_by(spot_table.c.task_id.asc())
     rows = None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         rows = session.execute(query).fetchall()
@@ -1186,15 +1202,17 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
         job_dict['metadata'] = json.loads(job_dict['metadata'])

         # Add user YAML content for managed jobs.
-        yaml_path = job_dict.get('original_user_yaml_path')
-        if yaml_path:
-            try:
-                with open(yaml_path, 'r', encoding='utf-8') as f:
-                    job_dict['user_yaml'] = f.read()
-            except (FileNotFoundError, IOError, OSError):
-                job_dict['user_yaml'] = None
-        else:
-            job_dict['user_yaml'] = None
+        job_dict['user_yaml'] = job_dict.get('original_user_yaml_content')
+        if job_dict['user_yaml'] is None:
+            # Backwards compatibility - try to read from file path
+            yaml_path = job_dict.get('original_user_yaml_path')
+            if yaml_path:
+                try:
+                    with open(yaml_path, 'r', encoding='utf-8') as f:
+                        job_dict['user_yaml'] = f.read()
+                except (FileNotFoundError, IOError, OSError) as e:
+                    logger.debug('Failed to read original user YAML for job '
+                                 f'{job_id} from {yaml_path}: {e}')

         jobs.append(job_dict)
     return jobs
@@ -1408,7 +1426,13 @@ def get_managed_jobs_with_filters(
         page: Optional[int] = None,
         limit: Optional[int] = None,
 ) -> Tuple[List[Dict[str, Any]], int]:
-    """Get managed jobs from the database with filters."""
+    """Get managed jobs from the database with filters.
+
+    Returns:
+        A tuple containing
+        - the list of managed jobs
+        - the total number of managed jobs
+    """
     assert _SQLALCHEMY_ENGINE is not None

     count_query = build_managed_jobs_with_filters_query(
@@ -1447,7 +1471,8 @@ def get_managed_jobs_with_filters(
     jobs = []
     for row in rows:
         job_dict = _get_jobs_dict(row._mapping)  # pylint: disable=protected-access
-        job_dict['status'] = ManagedJobStatus(job_dict['status'])
+        if job_dict.get('status') is not None:
+            job_dict['status'] = ManagedJobStatus(job_dict['status'])
         if job_dict.get('schedule_state') is not None:
             job_dict['schedule_state'] = ManagedJobScheduleState(
                 job_dict['schedule_state'])
@@ -1457,15 +1482,22 @@ def get_managed_jobs_with_filters(
         job_dict['metadata'] = json.loads(job_dict['metadata'])

         # Add user YAML content for managed jobs.
-        yaml_path = job_dict.get('original_user_yaml_path')
-        if (not fields or 'user_yaml' in fields) and yaml_path:
-            try:
-                with open(yaml_path, 'r', encoding='utf-8') as f:
-                    job_dict['user_yaml'] = f.read()
-            except (FileNotFoundError, IOError, OSError):
-                job_dict['user_yaml'] = None
-        else:
-            job_dict['user_yaml'] = None
+        job_dict['user_yaml'] = job_dict.get('original_user_yaml_content')
+        if job_dict['user_yaml'] is None:
+            # Backwards compatibility - try to read from file path
+            yaml_path = job_dict.get('original_user_yaml_path')
+            if yaml_path:
+                try:
+                    with open(yaml_path, 'r', encoding='utf-8') as f:
+                        job_dict['user_yaml'] = f.read()
+                except (FileNotFoundError, IOError, OSError) as e:
+                    job_id = job_dict.get('job_id')
+                    if job_id is not None:
+                        logger.debug('Failed to read original user YAML for '
+                                     f'job {job_id} from {yaml_path}: {e}')
+                    else:
+                        logger.debug('Failed to read original user YAML from '
+                                     f'{yaml_path}: {e}')

         jobs.append(job_dict)
     return jobs, total
@@ -1511,9 +1543,9 @@ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:


 @_init_db
-def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
-                          original_user_yaml_path: str, env_file_path: str,
-                          priority: int):
+def scheduler_set_waiting(job_id: int, dag_yaml_content: str,
+                          original_user_yaml_content: str,
+                          env_file_content: str, priority: int):
     """Do not call without holding the scheduler lock.

     Returns: Whether this is a recovery run or not.
@@ -1525,19 +1557,48 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         updated_count = session.query(job_info_table).filter(
-            sqlalchemy.and_(job_info_table.c.spot_job_id == job_id,)
-        ).update({
-            job_info_table.c.schedule_state:
-                ManagedJobScheduleState.WAITING.value,
-            job_info_table.c.dag_yaml_path: dag_yaml_path,
-            job_info_table.c.original_user_yaml_path: original_user_yaml_path,
-            job_info_table.c.env_file_path: env_file_path,
-            job_info_table.c.priority: priority,
-        })
+            sqlalchemy.and_(job_info_table.c.spot_job_id == job_id,)).update({
+                job_info_table.c.schedule_state:
+                    ManagedJobScheduleState.WAITING.value,
+                job_info_table.c.dag_yaml_content: dag_yaml_content,
+                job_info_table.c.original_user_yaml_content:
+                    (original_user_yaml_content),
+                job_info_table.c.env_file_content: env_file_content,
+                job_info_table.c.priority: priority,
+            })
         session.commit()
         assert updated_count <= 1, (job_id, updated_count)


+@_init_db
+def get_job_file_contents(job_id: int) -> Dict[str, Optional[str]]:
+    """Return file information and stored contents for a managed job."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.execute(
+            sqlalchemy.select(
+                job_info_table.c.dag_yaml_path,
+                job_info_table.c.env_file_path,
+                job_info_table.c.dag_yaml_content,
+                job_info_table.c.env_file_content,
+            ).where(job_info_table.c.spot_job_id == job_id)).fetchone()
+
+    if row is None:
+        return {
+            'dag_yaml_path': None,
+            'env_file_path': None,
+            'dag_yaml_content': None,
+            'env_file_content': None,
+        }
+
+    return {
+        'dag_yaml_path': row[0],
+        'env_file_path': row[1],
+        'dag_yaml_content': row[2],
+        'env_file_content': row[3],
+    }
+
+
 @_init_db
 def get_pool_from_job_id(job_id: int) -> Optional[str]:
     """Get the pool from the job id."""
@@ -2331,3 +2392,118 @@ def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
         rows = session.execute(query).fetchall()
         job_ids = [row[0] for row in rows if row[0] is not None]
         return job_ids
+
+
+@_init_db_async
+async def get_task_logs_to_clean_async(retention_seconds: int,
+                                       batch_size) -> List[Dict[str, Any]]:
+    """Get the logs of job tasks to clean.
+
+    The logs of a task will only be cleaned when:
+    - the job schedule state is DONE
+    - AND the end time of the task is older than the retention period
+    """
+
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        now = time.time()
+        result = await session.execute(
+            sqlalchemy.select(
+                spot_table.c.spot_job_id,
+                spot_table.c.task_id,
+                spot_table.c.local_log_file,
+            ).select_from(
+                spot_table.join(
+                    job_info_table,
+                    spot_table.c.spot_job_id == job_info_table.c.spot_job_id,
+                )).
+            where(
+                sqlalchemy.and_(
+                    job_info_table.c.schedule_state.is_(
+                        ManagedJobScheduleState.DONE.value),
+                    spot_table.c.end_at.isnot(None),
+                    spot_table.c.end_at < (now - retention_seconds),
+                    spot_table.c.logs_cleaned_at.is_(None),
+                    # The local log file is set AFTER the task is finished,
+                    # add this condition to ensure the entire log file has
+                    # been written.
+                    spot_table.c.local_log_file.isnot(None),
+                )).limit(batch_size))
+        rows = result.fetchall()
+        return [{
+            'job_id': row[0],
+            'task_id': row[1],
+            'local_log_file': row[2]
+        } for row in rows]
+
+
+@_init_db_async
+async def get_controller_logs_to_clean_async(
+        retention_seconds: int, batch_size: int) -> List[Dict[str, Any]]:
+    """Get the controller logs to clean.
+
+    The controller logs will only be cleaned when:
+    - the job schedule state is DONE
+    - AND the end time of the latest task is older than the retention period
+    """
+
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        now = time.time()
+
+        result = await session.execute(
+            sqlalchemy.select(job_info_table.c.spot_job_id,).select_from(
+                job_info_table.join(
+                    spot_table,
+                    job_info_table.c.spot_job_id == spot_table.c.spot_job_id,
+                )).where(
+                    sqlalchemy.and_(
+                        job_info_table.c.schedule_state.is_(
+                            ManagedJobScheduleState.DONE.value),
+                        spot_table.c.local_log_file.isnot(None),
+                        job_info_table.c.controller_logs_cleaned_at.is_(None),
+                    )).group_by(
+                        job_info_table.c.spot_job_id,
+                        job_info_table.c.current_cluster_name,
+                    ).having(
+                        sqlalchemy.func.max(
+                            spot_table.c.end_at).isnot(None),).having(
+                                sqlalchemy.func.max(spot_table.c.end_at) <
+                                (now - retention_seconds)).limit(batch_size))
+        rows = result.fetchall()
+        return [{'job_id': row[0]} for row in rows]
+
+
+@_init_db_async
+async def set_task_logs_cleaned_async(tasks: List[Tuple[int, int]],
+                                      logs_cleaned_at: float):
+    """Set the task logs cleaned at."""
+    if not tasks:
+        return
+    # Deduplicate
+    task_keys = list(dict.fromkeys(tasks))
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        await session.execute(
+            sqlalchemy.update(spot_table).where(
+                sqlalchemy.tuple_(spot_table.c.spot_job_id,
+                                  spot_table.c.task_id).in_(task_keys)).values(
+                                      logs_cleaned_at=logs_cleaned_at))
+        await session.commit()
+
+
+@_init_db_async
+async def set_controller_logs_cleaned_async(job_ids: List[int],
+                                            logs_cleaned_at: float):
+    """Set the controller logs cleaned at."""
+    if not job_ids:
+        return
+    # Deduplicate
+    job_ids = list(dict.fromkeys(job_ids))
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        await session.execute(
+            sqlalchemy.update(job_info_table).where(
+                job_info_table.c.spot_job_id.in_(job_ids)).values(
+                    controller_logs_cleaned_at=logs_cleaned_at))
+        await session.commit()
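
Taken together, the new logs_cleaned_at / controller_logs_cleaned_at columns and the async helpers above support a retention-based log garbage collector (the new sky/jobs/log_gc.py in the file list). The GC loop itself is not shown in this diff; the sketch below only illustrates how the two task-log helpers could be combined, with the retention window and batch size chosen arbitrarily for illustration.

import asyncio
import os
import time

from sky.jobs import state as managed_job_state

# Hypothetical values for illustration; the real policy lives in
# sky/jobs/log_gc.py, which is not shown in this diff.
RETENTION_SECONDS = 7 * 24 * 3600
BATCH_SIZE = 100


async def gc_task_logs_once() -> None:
    """Delete one batch of expired task log files and mark them cleaned."""
    tasks = await managed_job_state.get_task_logs_to_clean_async(
        RETENTION_SECONDS, BATCH_SIZE)
    cleaned = []
    for task in tasks:
        log_file = task['local_log_file']
        if log_file:
            path = os.path.expanduser(log_file)
            if os.path.exists(path):
                os.remove(path)
        cleaned.append((task['job_id'], task['task_id']))
    if cleaned:
        # Record the cleanup time so the rows are not selected again.
        await managed_job_state.set_task_logs_cleaned_async(
            cleaned, logs_cleaned_at=time.time())


if __name__ == '__main__':
    asyncio.run(gc_task_logs_once())
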
sky/jobs/utils.py CHANGED
@@ -6,7 +6,7 @@ ManagedJobCodeGen.
 """
 import asyncio
 import collections
-import datetime
+from datetime import datetime
 import enum
 import os
 import pathlib
@@ -195,8 +195,8 @@ def _validate_consolidation_mode_config(
                 'terminate the controller cluster first.'
                 f'{colorama.Style.RESET_ALL}')
     else:
-        all_jobs = managed_job_state.get_managed_jobs()
-        if all_jobs:
+        total_jobs = managed_job_state.get_managed_jobs_total()
+        if total_jobs > 0:
             nonterminal_jobs = (
                 managed_job_state.get_nonterminal_job_ids_by_name(
                     None, None, all_users=True))
@@ -211,7 +211,7 @@ def _validate_consolidation_mode_config(
             else:
                 logger.warning(
                     f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
-                    f'but there are {len(all_jobs)} jobs from previous '
+                    f'but there are {total_jobs} jobs from previous '
                     'consolidation mode. Reset the `jobs.controller.'
                     'consolidation_mode` to `true` and run `sky jobs queue` '
                     'to see those jobs. Switching to normal mode will '
@@ -266,6 +266,12 @@ def is_consolidation_mode(on_api_restart: bool = False) -> bool:

 def ha_recovery_for_consolidation_mode():
     """Recovery logic for HA mode."""
+    # Touch the signal file here to avoid conflict with
+    # update_managed_jobs_statuses. Although we run this first and then start
+    # the daemon, this function is also called in cancel_jobs_by_id.
+    signal_file = pathlib.Path(
+        constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE).expanduser()
+    signal_file.touch()
     # No setup recovery is needed in consolidation mode, as the API server
     # already has all runtime installed. Directly start jobs recovery here.
     # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
@@ -276,7 +282,9 @@ def ha_recovery_for_consolidation_mode():
               encoding='utf-8') as f:
         start = time.time()
         f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
-        for job in managed_job_state.get_managed_jobs():
+        jobs, _ = managed_job_state.get_managed_jobs_with_filters(
+            fields=['job_id', 'controller_pid', 'schedule_state', 'status'])
+        for job in jobs:
             job_id = job['job_id']
             controller_pid = job['controller_pid']

@@ -312,6 +320,7 @@ def ha_recovery_for_consolidation_mode():
                         f'{datetime.datetime.now()}\n')
         f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
         f.write(f'Total recovery time: {time.time() - start} seconds\n')
+    signal_file.unlink()


 async def get_job_status(
@@ -456,7 +465,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
     """
     managed_job_state.remove_ha_recovery_script(job_id)
     error_msg = None
-    tasks = managed_job_state.get_managed_jobs(job_id)
+    tasks = managed_job_state.get_managed_job_tasks(job_id)
     for task in tasks:
         pool = task.get('pool', None)
         if pool is None:
@@ -525,7 +534,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):

     for job_id in job_ids:
         assert job_id is not None
-        tasks = managed_job_state.get_managed_jobs(job_id)
+        tasks = managed_job_state.get_managed_job_tasks(job_id)
         # Note: controller_pid and schedule_state are in the job_info table
         # which is joined to the spot table, so all tasks with the same job_id
         # will have the same value for these columns. This is what lets us just
@@ -545,9 +554,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
         if schedule_state == managed_job_state.ManagedJobScheduleState.DONE:
             # There are two cases where we could get a job that is DONE.
             # 1. At query time (get_jobs_to_check_status), the job was not yet
-            # DONE, but since then (before get_managed_jobs is called) it has
-            # hit a terminal status, marked itself done, and exited. This is
-            # fine.
+            # DONE, but since then (before get_managed_job_tasks is called)
+            # it has hit a terminal status, marked itself done, and exited.
+            # This is fine.
             # 2. The job is DONE, but in a non-terminal status. This is
             # unexpected. For instance, the task status is RUNNING, but the
             # job schedule_state is DONE.
@@ -901,6 +910,14 @@ def cancel_jobs_by_pool(pool_name: str,
     return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)


+def controller_log_file_for_job(job_id: int,
+                                create_if_not_exists: bool = False) -> str:
+    log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+    if create_if_not_exists:
+        os.makedirs(log_dir, exist_ok=True)
+    return os.path.join(log_dir, f'{job_id}.log')
+
+
 def stream_logs_by_id(job_id: int,
                       follow: bool = True,
                       tail: Optional[int] = None) -> Tuple[str, int]:
@@ -933,13 +950,20 @@ def stream_logs_by_id(job_id: int,
             if managed_job_status.is_failed():
                 job_msg = ('\nFailure reason: '
                            f'{managed_job_state.get_failure_reason(job_id)}')
-            log_file_exists = False
+            log_file_ever_existed = False
             task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
                 job_id)
             num_tasks = len(task_info)
-            for task_id, task_name, task_status, log_file in task_info:
+            for (task_id, task_name, task_status, log_file,
+                 logs_cleaned_at) in task_info:
                 if log_file:
-                    log_file_exists = True
+                    log_file_ever_existed = True
+                    if logs_cleaned_at is not None:
+                        ts_str = datetime.fromtimestamp(
+                            logs_cleaned_at).strftime('%Y-%m-%d %H:%M:%S')
+                        print(f'Task {task_name}({task_id}) log has been '
+                              f'cleaned at {ts_str}.')
+                        continue
                     task_str = (f'Task {task_name}({task_id})'
                                 if task_name else f'Task {task_id}')
                     if num_tasks > 1:
@@ -974,7 +998,7 @@ def stream_logs_by_id(job_id: int,
                                     f'{task_str} finished '
                                     f'(status: {task_status.value}).'),
                       flush=True)
-            if log_file_exists:
+            if log_file_ever_existed:
                 # Add the "Job finished" message for terminal states
                 if managed_job_status.is_terminal():
                     print(ux_utils.finishing_message(
@@ -1202,7 +1226,8 @@ def stream_logs(job_id: Optional[int],
     if controller:
         if job_id is None:
             assert job_name is not None
-            managed_jobs = managed_job_state.get_managed_jobs()
+            managed_jobs, _ = managed_job_state.get_managed_jobs_with_filters(
+                name_match=job_name, fields=['job_id', 'job_name', 'status'])
             # We manually filter the jobs by name, instead of using
             # get_nonterminal_job_ids_by_name, as with `controller=True`, we
             # should be able to show the logs for jobs in terminal states.
@@ -1225,9 +1250,7 @@ def stream_logs(job_id: Optional[int],
             job_id = managed_job_ids.pop()
             assert job_id is not None, (job_id, job_name)

-        controller_log_path = os.path.join(
-            os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
-            f'{job_id}.log')
+        controller_log_path = controller_log_file_for_job(job_id)
         job_status = None

         # Wait for the log file to be written
@@ -1378,9 +1401,11 @@ def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
         new_fields.append('priority')
     if 'failure_reason' not in new_fields:
         new_fields.append('failure_reason')
-    if ('user_yaml' in new_fields and
-            'original_user_yaml_path' not in new_fields):
-        new_fields.append('original_user_yaml_path')
+    if 'user_yaml' in new_fields:
+        if 'original_user_yaml_path' not in new_fields:
+            new_fields.append('original_user_yaml_path')
+        if 'original_user_yaml_content' not in new_fields:
+            new_fields.append('original_user_yaml_content')
     if cluster_handle_required:
         if 'task_name' not in new_fields:
             new_fields.append('task_name')
@@ -1522,12 +1547,11 @@ def get_managed_job_queue(
         handle = cluster_name_to_handle.get(
             cluster_name, None) if cluster_name is not None else None
         if isinstance(handle, backends.CloudVmRayResourceHandle):
-            resources_str = resources_utils.get_readable_resources_repr(
-                handle, simplify=True)
-            resources_str_full = (
-                resources_utils.get_readable_resources_repr(handle,
-                                                            simplify=False))
-            job['cluster_resources'] = resources_str
+            resources_str_simple, resources_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    handle, simplified_only=False))
+            assert resources_str_full is not None
+            job['cluster_resources'] = resources_str_simple
             job['cluster_resources_full'] = resources_str_full
             job['cloud'] = str(handle.launched_resources.cloud)
             job['region'] = handle.launched_resources.region
@@ -2110,7 +2134,8 @@ def _job_proto_to_dict(
         # and Protobuf encodes int64 as decimal strings in JSON,
         # so we need to convert them back to ints.
         # https://protobuf.dev/programming-guides/json/#field-representation
-        if field.type == descriptor.FieldDescriptor.TYPE_INT64:
+        if (field.type == descriptor.FieldDescriptor.TYPE_INT64 and
+                job_dict.get(field.name) is not None):
             job_dict[field.name] = int(job_dict[field.name])
     job_dict['status'] = managed_job_state.ManagedJobStatus.from_protobuf(
         job_dict['status'])
@@ -2265,6 +2290,18 @@ class ManagedJobCodeGen:
         """)
         return cls._build(code)

+    @classmethod
+    def get_version(cls) -> str:
+        """Generate code to get controller version."""
+        code = textwrap.dedent("""\
+        from sky.skylet import constants as controller_constants
+
+        # Get controller version
+        controller_version = controller_constants.SKYLET_VERSION
+        print(f"controller_version:{controller_version}", flush=True)
+        """)
+        return cls._build(code)
+
     @classmethod
     def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
         code = textwrap.dedent(f"""\
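The new ManagedJobCodeGen.get_version() classmethod generates a snippet that is executed on the jobs controller and prints its SKYLET_VERSION on stdout. How the caller consumes that output is not part of this excerpt; the helper below is only a hypothetical sketch of parsing the printed line.

from typing import Optional


def parse_controller_version(output: str) -> Optional[str]:
    """Extract the version from the output of the get_version() snippet."""
    # The generated code prints a line of the form
    # 'controller_version:<SKYLET_VERSION>'; scan stdout for it.
    for line in output.splitlines():
        if line.startswith('controller_version:'):
            return line.split(':', maxsplit=1)[1].strip()
    return None
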
sky/metrics/utils.py CHANGED
@@ -143,6 +143,24 @@ SKY_APISERVER_REQUEST_RSS_INCR_BYTES = prom.Histogram(
     'RSS increment after requests', ['name'],
     buckets=_MEM_BUCKETS)

+SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS = prom.Histogram(
+    'sky_apiserver_websocket_ssh_latency_seconds',
+    ('Time taken for ssh message to go from client to API server and back '
+     'to the client. This does not include: latency to reach the pod, '
+     'overhead from sending through the k8s port-forward tunnel, or '
+     'ssh server lag on the destination pod.'),
+    ['pid'],
+    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.25,
+             0.35, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.5, 2.75, 3, 3.5, 4, 4.5,
+             5, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 25.0, 30.0, 35.0, 40.0, 45.0,
+             50.0, 55.0, 60.0, 80.0, 120.0, 140.0, 160.0, 180.0, 200.0, 220.0,
+             240.0, 260.0, 280.0, 300.0, 320.0, 340.0, 360.0, 380.0, 400.0,
+             420.0, 440.0, 460.0, 480.0, 500.0, 520.0, 540.0, 560.0, 580.0,
+             600.0, 620.0, 640.0, 660.0, 680.0, 700.0, 720.0, 740.0, 760.0,
+             780.0, 800.0, 820.0, 840.0, 860.0, 880.0, 900.0, 920.0, 940.0,
+             960.0, 980.0, 1000.0, float('inf')),
+)
+

 @contextlib.contextmanager
 def time_it(name: str, group: str = 'default'):
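The new metric is a standard prometheus_client Histogram labeled by process id. Its call sites are not shown in this excerpt (the websocket proxy changes in sky/templates/websocket_proxy.py, +140 -12 above, are the likely consumer); the sketch below only illustrates how such a histogram is typically observed, with the function name and timing variable being assumptions.

import os
import time

from sky.metrics import utils as metrics_utils


def record_ssh_roundtrip(start_time: float) -> None:
    """Record one client -> API server -> client round trip, in seconds."""
    elapsed = time.time() - start_time
    metrics_utils.SKY_APISERVER_WEBSOCKET_SSH_LATENCY_SECONDS.labels(
        pid=str(os.getpid())).observe(elapsed)
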
sky/optimizer.py CHANGED
@@ -1019,7 +1019,7 @@ class Optimizer:
                     if res.instance_type is not None
                 ])
                 candidate_str = resources_utils.format_resource(
-                    best_resources, simplify=True)
+                    best_resources, simplified_only=True)[0]

                 logger.info(
                     f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
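Both this call site and the jobs-queue hunk above reflect an apparent signature change in sky/utils/resources_utils.py (+53 -29 in the file list): the old boolean simplify flag, which returned a single string, is replaced by simplified_only, and the helpers now return a pair. That implementation is not shown here; the snippet below is a hypothetical, self-contained illustration of the (simple, full) return convention the call sites imply.

from typing import Optional, Tuple


def readable_repr(resources: str,
                  simplified_only: bool = True) -> Tuple[str, Optional[str]]:
    """Hypothetical stand-in mirroring the (simple, full) return convention."""
    simple = resources.split(',', maxsplit=1)[0]
    # The full form is only computed (and guaranteed non-None) when the caller
    # asks for it, which is why the diff asserts on it after
    # simplified_only=False.
    full = None if simplified_only else resources
    return simple, full


simple, full = readable_repr('1x[CPU:4], disk=256GB', simplified_only=False)
assert full is not None
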