skypilot-nightly 1.0.0.dev20250623__py3-none-any.whl → 1.0.0.dev20250624__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/admin_policy.py +16 -5
- sky/backends/__init__.py +2 -1
- sky/backends/backend_utils.py +12 -0
- sky/backends/cloud_vm_ray_backend.py +36 -13
- sky/client/cli/command.py +42 -21
- sky/client/sdk.py +12 -6
- sky/clouds/kubernetes.py +1 -0
- sky/core.py +88 -15
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/37-4650f214e2119168.js +6 -0
- sky/dashboard/out/_next/static/chunks/42.2273cc2415291ceb.js +6 -0
- sky/dashboard/out/_next/static/chunks/470-1494c899266cf5c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/{513.211357a2914a34b2.js → 513.309df9e18a9ff005.js} +1 -1
- sky/dashboard/out/_next/static/chunks/856-bfddc18e16f3873c.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-ce7991c156584b06.js +1 -0
- sky/dashboard/out/_next/static/chunks/969-d3a0b53f728d280a.js +1 -0
- sky/dashboard/out/_next/static/chunks/989-db34c16ad7ea6155.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-c416e87d5c2715cf.js → _app-ce31493da9747ef4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-4e065c812a52460b.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-520ec1ab65e2f2a4.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-7e9736af1c6345a6.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-e4f473661889e7cd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-00fd23b9577492ca.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-8a4bf7370d4d9bb7.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-cf490d1fa38f3740.js → [job]-171c27f4ca94861c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-55e5bcb16d563231.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-c9f4d785cdaa52d8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c4ff1ec05e2f3daf.js → [name]-ecc5a7003776cfa7.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-f00cba35691483b1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-c85998e6a5722f21.js +1 -0
- sky/dashboard/out/_next/static/css/6ab927686b492a4a.css +3 -0
- sky/dashboard/out/_next/static/zsALxITkbP8J8NVwSDwMo/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +11 -0
- sky/global_user_state.py +134 -20
- sky/jobs/client/sdk.py +0 -1
- sky/jobs/controller.py +5 -1
- sky/jobs/scheduler.py +4 -3
- sky/jobs/server/core.py +117 -51
- sky/jobs/state.py +15 -0
- sky/jobs/utils.py +114 -8
- sky/resources.py +1 -1
- sky/server/requests/payloads.py +6 -3
- sky/server/requests/requests.py +24 -1
- sky/server/server.py +4 -3
- sky/skylet/constants.py +5 -11
- sky/task.py +1 -26
- sky/templates/jobs-controller.yaml.j2 +12 -1
- sky/templates/kubernetes-ray.yml.j2 +1 -1
- sky/utils/admin_policy_utils.py +5 -1
- sky/utils/cli_utils/status_utils.py +25 -17
- sky/utils/command_runner.py +118 -12
- sky/utils/command_runner.pyi +57 -0
- sky/utils/common_utils.py +9 -1
- sky/utils/controller_utils.py +1 -2
- sky/utils/schemas.py +34 -35
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/RECORD +78 -77
- sky/dashboard/out/_next/static/F4kiZ6Zh72jA6HzZ3ncFo/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/37-3a4d77ad62932eaf.js +0 -6
- sky/dashboard/out/_next/static/chunks/42.d39e24467181b06b.js +0 -6
- sky/dashboard/out/_next/static/chunks/470-4d1a5dbe58a8a2b9.js +0 -1
- sky/dashboard/out/_next/static/chunks/856-c2c39c0912285e54.js +0 -1
- sky/dashboard/out/_next/static/chunks/938-1493ac755eadeb35.js +0 -1
- sky/dashboard/out/_next/static/chunks/969-20d54a9d998dc102.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-89216c616dbaa9c5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters-82a651dbad53ec6e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/config-497a35a7ed49734a.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-d2910be98e9227cb.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-780860bcc1103945.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-336ab80e270ce2ce.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0263b00d6a10e64a.js +0 -1
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +0 -3
- /sky/dashboard/out/_next/static/chunks/{843-b3040e493f6e7947.js → 843-bde186946d353355.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-db3c97c2bfbceb65.js → 973-56412c7976b4655b.js} +0 -0
- /sky/dashboard/out/_next/static/{F4kiZ6Zh72jA6HzZ3ncFo → zsALxITkbP8J8NVwSDwMo}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250623.dist-info → skypilot_nightly-1.0.0.dev20250624.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
@@ -134,6 +134,12 @@ cluster_history_table = sqlalchemy.Table(
     sqlalchemy.Column('launched_resources', sqlalchemy.LargeBinary),
     sqlalchemy.Column('usage_intervals', sqlalchemy.LargeBinary),
     sqlalchemy.Column('user_hash', sqlalchemy.Text),
+    sqlalchemy.Column('last_creation_yaml',
+                      sqlalchemy.Text,
+                      server_default=None),
+    sqlalchemy.Column('last_creation_command',
+                      sqlalchemy.Text,
+                      server_default=None),
 )
 
 ssh_key_table = sqlalchemy.Table(
@@ -308,6 +314,21 @@ def create_table():
             'password',
             sqlalchemy.Text(),
            default_statement='DEFAULT NULL')
+
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'cluster_history',
+            'last_creation_yaml',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL')
+
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'cluster_history',
+            'last_creation_command',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL')
+
         session.commit()
@@ -597,6 +618,14 @@ def add_or_update_cluster(cluster_name: str,
         # Modify cluster history table
         launched_nodes = getattr(cluster_handle, 'launched_nodes', None)
         launched_resources = getattr(cluster_handle, 'launched_resources', None)
+        creation_info = {}
+        if conditional_values.get('last_creation_yaml') is not None:
+            creation_info = {
+                'last_creation_yaml':
+                    conditional_values.get('last_creation_yaml'),
+                'last_creation_command':
+                    conditional_values.get('last_creation_command'),
+            }
 
         insert_stmnt = insert_func(cluster_history_table).values(
             cluster_hash=cluster_hash,
@@ -605,7 +634,9 @@ def add_or_update_cluster(cluster_name: str,
             requested_resources=pickle.dumps(requested_resources),
             launched_resources=pickle.dumps(launched_resources),
             usage_intervals=pickle.dumps(usage_intervals),
-            user_hash=user_hash)
+            user_hash=user_hash,
+            **creation_info,
+        )
         do_update_stmt = insert_stmnt.on_conflict_do_update(
             index_elements=[cluster_history_table.c.cluster_hash],
             set_={
@@ -617,7 +648,8 @@ def add_or_update_cluster(cluster_name: str,
                     pickle.dumps(launched_resources),
                 cluster_history_table.c.usage_intervals:
                     pickle.dumps(usage_intervals),
-                cluster_history_table.c.user_hash: user_hash
+                cluster_history_table.c.user_hash: user_hash,
+                **creation_info,
             })
         session.execute(do_update_stmt)
 
@@ -1027,40 +1059,122 @@ def get_clusters() -> List[Dict[str, Any]]:
 
 
 @_init_db
-def get_clusters_from_history(
+def get_clusters_from_history(
+        days: Optional[int] = None) -> List[Dict[str, Any]]:
+    """Get cluster reports from history.
+
+    Args:
+        days: If specified, only include historical clusters (those not
+            currently active) that were last used within the past 'days'
+            days. Active clusters are always included regardless of this
+            parameter.
+
+    Returns:
+        List of cluster records with history information.
+    """
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
-
-
-
-
-
-
-
-
+        # Explicitly select columns from both tables to avoid ambiguity
+        query = session.query(
+            cluster_history_table.c.cluster_hash, cluster_history_table.c.name,
+            cluster_history_table.c.num_nodes,
+            cluster_history_table.c.requested_resources,
+            cluster_history_table.c.launched_resources,
+            cluster_history_table.c.usage_intervals,
+            cluster_history_table.c.user_hash,
+            cluster_history_table.c.last_creation_yaml,
+            cluster_history_table.c.last_creation_command,
+            cluster_table.c.status, cluster_table.c.workspace,
+            cluster_table.c.status_updated_at).select_from(
+                cluster_history_table.join(cluster_table,
+                                           cluster_history_table.c.cluster_hash
+                                           == cluster_table.c.cluster_hash,
+                                           isouter=True))
+
+        rows = query.all()
+
+        # Prepare filtering parameters
+        cutoff_time = None
+        if days is not None:
+            cutoff_time = int(time.time()) - (days * 24 * 60 * 60)
+
         records = []
         for row in rows:
-            # TODO: use namedtuple instead of dict
             user_hash = _get_user_hash_or_current_user(row.user_hash)
-
-
-
+            launched_at = _get_cluster_launch_time(row.cluster_hash)
+            duration = _get_cluster_duration(row.cluster_hash)
+
+            # Parse status
+            status = None
+            if row.status:
+                status = status_lib.ClusterStatus[row.status]
+
+            # Apply filtering: always include active clusters, filter historical
+            # ones by time
+            if cutoff_time is not None and status is None:  # Historical cluster
+                # For historical clusters, check if they were used recently
+                # Use the most recent activity from usage_intervals to determine
+                # last use
+                usage_intervals = []
+                if row.usage_intervals:
+                    try:
+                        usage_intervals = pickle.loads(row.usage_intervals)
+                    except (pickle.PickleError, AttributeError):
+                        usage_intervals = []
+
+                # Find the most recent activity time from usage_intervals
+                last_activity_time = None
+                if usage_intervals:
+                    # Get the end time of the last interval (or start time if
+                    # still running)
+                    last_interval = usage_intervals[-1]
+                    last_activity_time = (last_interval[1] if last_interval[1]
+                                          is not None else last_interval[0])
+
+                # Skip historical clusters that haven't been used recently
+                if last_activity_time is None or last_activity_time < cutoff_time:
+                    continue
+
+            # Parse launched resources safely
+            launched_resources = None
+            if row.launched_resources:
+                try:
+                    launched_resources = pickle.loads(row.launched_resources)
+                except (pickle.PickleError, AttributeError):
+                    launched_resources = None
+
+            # Parse usage intervals safely
+            usage_intervals = []
+            if row.usage_intervals:
+                try:
+                    usage_intervals = pickle.loads(row.usage_intervals)
+                except (pickle.PickleError, AttributeError):
+                    usage_intervals = []
+
+            # Get user name from user hash
+            user = get_user(user_hash)
+            user_name = user.name if user is not None else None
+
             record = {
                 'name': row.name,
-                'launched_at':
-                'duration':
+                'launched_at': launched_at,
+                'duration': duration,
                 'num_nodes': row.num_nodes,
-                'resources':
+                'resources': launched_resources,
                 'cluster_hash': row.cluster_hash,
-                'usage_intervals':
+                'usage_intervals': usage_intervals,
                 'status': status,
                 'user_hash': user_hash,
+                'user_name': user_name,
+                'workspace': row.workspace,
+                'last_creation_yaml': row.last_creation_yaml,
+                'last_creation_command': row.last_creation_command,
            }
 
            records.append(record)
 
        # sort by launch time, descending in recency
-        records = sorted(records, key=lambda record: -record['launched_at'])
+        records = sorted(records, key=lambda record: -(record['launched_at'] or 0))
        return records
 
 
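The history row is written through an insert-or-update statement, and the new last_creation_yaml / last_creation_command columns are only included when a creation YAML is present, so later updates without that info do not clobber the stored values. Below is a minimal, self-contained sketch of that upsert pattern against an in-memory SQLite database; the simplified cluster_history table and the upsert helper are illustrative stand-ins, not SkyPilot's actual schema or API:

    import sqlalchemy
    from sqlalchemy.dialects.sqlite import insert as sqlite_insert

    engine = sqlalchemy.create_engine('sqlite://')
    metadata = sqlalchemy.MetaData()
    history = sqlalchemy.Table(
        'cluster_history', metadata,
        sqlalchemy.Column('cluster_hash', sqlalchemy.Text, primary_key=True),
        sqlalchemy.Column('user_hash', sqlalchemy.Text),
        sqlalchemy.Column('last_creation_yaml', sqlalchemy.Text),
        sqlalchemy.Column('last_creation_command', sqlalchemy.Text))
    metadata.create_all(engine)


    def upsert(conn, cluster_hash, user_hash, creation_yaml=None, command=None):
        # Only touch the creation columns when a new creation YAML is known,
        # so an update without this info keeps the previously stored values.
        creation_info = {}
        if creation_yaml is not None:
            creation_info = {
                'last_creation_yaml': creation_yaml,
                'last_creation_command': command,
            }
        stmt = sqlite_insert(history).values(cluster_hash=cluster_hash,
                                             user_hash=user_hash,
                                             **creation_info)
        stmt = stmt.on_conflict_do_update(
            index_elements=[history.c.cluster_hash],
            set_={'user_hash': user_hash, **creation_info})
        conn.execute(stmt)


    with engine.begin() as conn:
        upsert(conn, 'abc123', 'alice', creation_yaml='resources: {cpus: 2}',
               command='sky launch task.yaml')
        upsert(conn, 'abc123', 'alice')  # keeps the stored YAML and command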
sky/jobs/client/sdk.py
CHANGED
@@ -49,7 +49,6 @@ def launch(
         task: sky.Task, or sky.Dag (experimental; 1-task only) to launch as a
             managed job.
         name: Name of the managed job.
-        priority: Priority of the managed job.
         _need_confirmation: (Internal only) Whether to show a confirmation
             prompt before launching the job.
 
sky/jobs/controller.py
CHANGED
@@ -603,7 +603,11 @@ def _cleanup(job_id: int, dag_yaml: str):
     # mounts.
     for file_mount in (task.file_mounts or {}).values():
         try:
-
+            # For consolidation mode, there is no two-hop file mounts
+            # and the file path here represents the real user data.
+            # We skip the cleanup for consolidation mode.
+            if (not data_utils.is_cloud_store_url(file_mount) and
+                    not managed_job_utils.is_consolidation_mode()):
                 path = os.path.expanduser(file_mount)
                 if os.path.isdir(path):
                     shutil.rmtree(path)
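The cleanup change above only deletes locally staged file mounts when the path is not a cloud-store URL and the controller is not running in consolidation mode, where the path is the user's real data. A standalone sketch of that guard follows; _is_cloud_store_url and the consolidation_mode flag are simplified placeholders for SkyPilot's data_utils.is_cloud_store_url and managed_job_utils.is_consolidation_mode:

    import os
    import shutil
    from typing import Dict


    def _is_cloud_store_url(path: str) -> bool:
        # Simplified stand-in: treat common object-store schemes as cloud URLs.
        return path.startswith(('s3://', 'gs://', 'r2://', 'https://'))


    def cleanup_staged_file_mounts(file_mounts: Dict[str, str],
                                   consolidation_mode: bool) -> None:
        for file_mount in file_mounts.values():
            # In consolidation mode there is no two-hop staging copy, so the
            # path points at real user data and must not be removed.
            if _is_cloud_store_url(file_mount) or consolidation_mode:
                continue
            path = os.path.expanduser(file_mount)
            if os.path.isdir(path):
                shutil.rmtree(path)
            elif os.path.isfile(path):
                os.remove(path)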
sky/jobs/scheduler.py
CHANGED
@@ -40,6 +40,7 @@ from argparse import ArgumentParser
 import contextlib
 from functools import lru_cache
 import os
+import sys
 import time
 import typing
 
@@ -89,12 +90,12 @@ def _start_controller(job_id: int, dag_yaml_path: str,
     activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
     source_environment_cmd = (f'source {env_file_path};'
                               if env_file_path else '')
-    run_controller_cmd = ('
+    run_controller_cmd = (f'{sys.executable} -u -m sky.jobs.controller '
                           f'{dag_yaml_path} --job-id {job_id};')
 
     # If the command line here is changed, please also update
-    # utils._controller_process_alive. `--job-id X`
-    # the
+    # utils._controller_process_alive. The substring `--job-id X`
+    # should be in the command.
     run_cmd = (f'{activate_python_env_cmd}'
                f'{source_environment_cmd}'
                f'{run_controller_cmd}')
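The scheduler now builds the controller invocation from sys.executable -u -m sky.jobs.controller, so the child process runs under the same interpreter and environment as the scheduler, and the --job-id X substring stays in the command line for the liveness check mentioned in the comment. A minimal sketch of that pattern, using a plain subprocess call rather than the scheduler's shell pipeline:

    import subprocess
    import sys


    def start_controller(job_id: int, dag_yaml_path: str) -> subprocess.Popen:
        # Reuse the current interpreter so the controller sees the same Python
        # environment; keep '--job-id <id>' in the argv so a liveness check
        # can find the process by that substring.
        cmd = [
            sys.executable, '-u', '-m', 'sky.jobs.controller', dag_yaml_path,
            '--job-id', str(job_id)
        ]
        return subprocess.Popen(cmd, start_new_session=True)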
sky/jobs/server/core.py
CHANGED
@@ -1,5 +1,6 @@
 """SDK functions for managed jobs."""
 import os
+import pathlib
 import tempfile
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -20,6 +21,7 @@ from sky.backends import backend_utils
 from sky.catalog import common as service_catalog_common
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
+from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
 from sky.provision import common as provision_common
 from sky.skylet import constants as skylet_constants
@@ -43,6 +45,72 @@ if typing.TYPE_CHECKING:
 logger = sky_logging.init_logger(__name__)
 
 
+def _maybe_upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
+    """Maybe upload files to the controller.
+
+    In consolidation mode, we don't need to upload files to the controller as
+    the API server and the controller are colocated.
+    """
+    local_to_controller_file_mounts: Dict[str, str] = {}
+
+    if managed_job_utils.is_consolidation_mode():
+        return local_to_controller_file_mounts
+
+    if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
+        for task_ in dag.tasks:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task_, task_type='jobs')
+    else:
+        # We do not have any cloud storage available, so fall back to
+        # two-hop file_mount uploading.
+        # Note: we can't easily hack sync_storage_mounts() to upload
+        # directly to the controller, because the controller may not
+        # even be up yet.
+        for task_ in dag.tasks:
+            if task_.storage_mounts:
+                # Technically, we could convert COPY storage_mounts that
+                # have a local source and do not specify `store`, but we
+                # will not do that for now. Only plain file_mounts are
+                # supported.
+                raise exceptions.NotSupportedError(
+                    'Cloud-based file_mounts are specified, but no cloud '
+                    'storage is available. Please specify local '
+                    'file_mounts only.')
+
+            # Merge file mounts from all tasks.
+            local_to_controller_file_mounts.update(
+                controller_utils.translate_local_file_mounts_to_two_hop(task_))
+
+    return local_to_controller_file_mounts
+
+
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag') -> Optional[int]:
+    """Submit the managed job locally if in consolidation mode.
+
+    In normal mode the managed job submission is done in the ray job submission.
+    For consolidation mode, we need to manually submit it. Check the following
+    function for the normal mode submission:
+    sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend,
+    _exec_code_on_head::_maybe_add_managed_job_code
+    """
+    if not managed_job_utils.is_consolidation_mode():
+        return None
+
+    # Create local directory for the managed job.
+    pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
+    consolidation_mode_job_id = managed_job_state.set_job_info_without_job_id(
+        dag.name,
+        workspace=skypilot_config.get_active_workspace(
+            force_user_workspace=True),
+        entrypoint=common_utils.get_current_command())
+    for task_id, task in enumerate(dag.tasks):
+        resources_str = backend_utils.get_task_resources_str(
+            task, is_managed_job=True)
+        managed_job_state.set_pending(consolidation_mode_job_id, task_id,
+                                      task.name, resources_str)
+    return consolidation_mode_job_id
+
+
 @timeline.event
 @usage_lib.entrypoint
 def launch(
@@ -103,7 +171,7 @@ def launch(
                         'will be auto-generated) .')
             task_names.add(task_.name)
 
-            # Check for priority in resources
+            # Check for priority in resources
             task_priority = None
             if task_.resources:
                 # Convert set to list to access elements by index
@@ -121,20 +189,6 @@ def launch(
                             f'{resource.priority} but expected {task_priority}.'
                         )
 
-            # Check for conflict between resources priority and job
-            # priority
-            if task_.job_priority is not None:
-                with ux_utils.print_exception_no_traceback():
-                    raise ValueError(
-                        f'Task {task_.name!r}: Cannot specify both '
-                        f'resources.priority ({task_priority}) and '
-                        f'job.priority ({task_.job_priority}). Please use only '
-                        'one priority specification method.')
-
-            # Fall back to job priority if no resources priority found
-            if task_priority is None:
-                task_priority = task_.job_priority
-
             if task_priority is not None:
                 if (priority is not None and priority != task_priority):
                     with ux_utils.print_exception_no_traceback():
@@ -183,34 +237,7 @@ def launch(
                     f'with:\n\n`sky down {cluster_name} --purge`\n\n'
                     f'Reason: {common_utils.format_exception(e)}')
 
-
-
-    if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
-        for task_ in dag.tasks:
-            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-                task_, task_type='jobs')
-
-    else:
-        # We do not have any cloud storage available, so fall back to
-        # two-hop file_mount uploading.
-        # Note: we can't easily hack sync_storage_mounts() to upload
-        # directly to the controller, because the controller may not
-        # even be up yet.
-        for task_ in dag.tasks:
-            if task_.storage_mounts:
-                # Technically, we could convert COPY storage_mounts that
-                # have a local source and do not specify `store`, but we
-                # will not do that for now. Only plain file_mounts are
-                # supported.
-                raise exceptions.NotSupportedError(
-                    'Cloud-based file_mounts are specified, but no cloud '
-                    'storage is available. Please specify local '
-                    'file_mounts only.')
-
-            # Merge file mounts from all tasks.
-            local_to_controller_file_mounts.update(
-                controller_utils.translate_local_file_mounts_to_two_hop(
-                    task_))
+    local_to_controller_file_mounts = _maybe_upload_files_to_controller(dag)
 
     # Has to use `\` to avoid yapf issue.
     with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
@@ -233,6 +260,13 @@ def launch(
         controller=controller,
         task_resources=sum([list(t.resources) for t in dag.tasks], []))
 
+    consolidation_mode_job_id = _maybe_submit_job_locally(prefix, dag)
+
+    # This is only needed for non-consolidation mode. For consolidation
+    # mode, the controller uses the same catalog as API server.
+    modified_catalogs = {} if consolidation_mode_job_id is not None else (
+        service_catalog_common.get_modified_catalog_file_mounts())
+
     vars_to_fill = {
         'remote_original_user_yaml_path': remote_original_user_yaml_path,
         'original_user_dag_path': original_user_yaml_path.name,
@@ -244,9 +278,9 @@ def launch(
         'dag_name': dag.name,
         'remote_user_config_path': remote_user_config_path,
         'remote_env_file_path': remote_env_file_path,
-        'modified_catalogs':
-            service_catalog_common.get_modified_catalog_file_mounts(),
+        'modified_catalogs': modified_catalogs,
         'priority': priority,
+        'consolidation_mode_job_id': consolidation_mode_job_id,
         **controller_utils.shared_controller_vars_to_fill(
             controller,
             remote_user_config_path=remote_user_config_path,
@@ -285,12 +319,44 @@ def launch(
             # workspace A, but the controller is in workspace B, the
             # intermediate bucket and newly created bucket should be in
            # workspace A.
-
-
-
-
-
-
+            if consolidation_mode_job_id is None:
+                return execution.launch(task=controller_task,
+                                        cluster_name=controller_name,
+                                        stream_logs=stream_logs,
+                                        retry_until_up=True,
+                                        fast=True,
+                                        _disable_controller_check=True)
+            # Manually launch the scheduler process in consolidation mode.
+            local_handle = backend_utils.is_controller_accessible(
+                controller=controller, stopped_message='')
+            backend = backend_utils.get_backend_from_handle(local_handle)
+            assert isinstance(backend, backends.CloudVmRayBackend)
+            backend.sync_file_mounts(
+                handle=local_handle,
+                all_file_mounts=controller_task.file_mounts,
+                storage_mounts=controller_task.storage_mounts)
+            run_script = controller_task.run
+            assert isinstance(run_script, str)
+            # Manually add the env variables to the run script. Originally
+            # this is done in ray jobs submission but now we have to do it
+            # manually because there is no ray runtime on the API server.
+            env_cmds = [
+                f'export {k}={v!r}'
+                for k, v in controller_task.envs.items()
+            ]
+            run_script = '\n'.join(env_cmds + [run_script])
+            # Dump script for high availability recovery.
+            if controller_utils.high_availability_specified(
+                    controller_name):
+                dump_script_path = (
+                    managed_job_utils.get_ha_dump_script_path(
+                        consolidation_mode_job_id))
+                dump_script_path.parent.mkdir(parents=True, exist_ok=True)
+                with open(dump_script_path, 'w',
+                          encoding='utf-8') as script_f:
+                    script_f.write(run_script)
+            backend.run_on_head(local_handle, run_script)
+            return consolidation_mode_job_id, local_handle
 
 
 def queue_from_kubernetes_pod(
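In consolidation mode there is no Ray job submission to inject the controller task's environment variables, so the launch path above prepends export lines to the run script before handing it to run_on_head. A small standalone sketch of that string manipulation follows; it uses shlex.quote for shell quoting where the diff uses Python's !r formatting:

    import shlex
    from typing import Dict


    def inject_envs(run_script: str, envs: Dict[str, str]) -> str:
        # Prepend one 'export KEY=VALUE' line per variable so the script sees
        # them without relying on a job runtime to set them.
        env_cmds = [
            f'export {k}={shlex.quote(str(v))}' for k, v in envs.items()
        ]
        return '\n'.join(env_cmds + [run_script])


    script = inject_envs('echo "running job $JOB_ID"', {'JOB_ID': '42'})
    print(script)
    # export JOB_ID=42
    # echo "running job $JOB_ID"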
sky/jobs/state.py
CHANGED
@@ -463,6 +463,21 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str):
                 entrypoint))
 
 
+@_init_db
+def set_job_info_without_job_id(name: str, workspace: str,
+                                entrypoint: str) -> int:
+    assert _DB_PATH is not None
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute(
+            """\
+            INSERT INTO job_info
+            (name, schedule_state, workspace, entrypoint)
+            VALUES (?, ?, ?, ?)""",
+            (name, ManagedJobScheduleState.INACTIVE.value, workspace,
+             entrypoint))
+        return cursor.lastrowid
+
+
 @_init_db
 def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
     """Set the task to pending state."""