PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (299) hide show

sky/__init__.py +64 -32
sky/adaptors/aws.py +23 -6
sky/adaptors/azure.py +432 -15
sky/adaptors/cloudflare.py +5 -5
sky/adaptors/common.py +19 -9
sky/adaptors/do.py +20 -0
sky/adaptors/gcp.py +3 -2
sky/adaptors/kubernetes.py +122 -88
sky/adaptors/nebius.py +100 -0
sky/adaptors/oci.py +39 -1
sky/adaptors/vast.py +29 -0
sky/admin_policy.py +101 -0
sky/authentication.py +117 -98
sky/backends/backend.py +52 -20
sky/backends/backend_utils.py +669 -557
sky/backends/cloud_vm_ray_backend.py +1099 -808
sky/backends/local_docker_backend.py +14 -8
sky/backends/wheel_utils.py +38 -20
sky/benchmark/benchmark_utils.py +22 -23
sky/check.py +76 -27
sky/cli.py +1586 -1139
sky/client/__init__.py +1 -0
sky/client/cli.py +5683 -0
sky/client/common.py +345 -0
sky/client/sdk.py +1765 -0
sky/cloud_stores.py +283 -19
sky/clouds/__init__.py +7 -2
sky/clouds/aws.py +303 -112
sky/clouds/azure.py +185 -179
sky/clouds/cloud.py +115 -37
sky/clouds/cudo.py +29 -22
sky/clouds/do.py +313 -0
sky/clouds/fluidstack.py +44 -54
sky/clouds/gcp.py +206 -65
sky/clouds/ibm.py +26 -21
sky/clouds/kubernetes.py +345 -91
sky/clouds/lambda_cloud.py +40 -29
sky/clouds/nebius.py +297 -0
sky/clouds/oci.py +129 -90
sky/clouds/paperspace.py +22 -18
sky/clouds/runpod.py +53 -34
sky/clouds/scp.py +28 -24
sky/clouds/service_catalog/__init__.py +19 -13
sky/clouds/service_catalog/aws_catalog.py +29 -12
sky/clouds/service_catalog/azure_catalog.py +33 -6
sky/clouds/service_catalog/common.py +95 -75
sky/clouds/service_catalog/constants.py +3 -3
sky/clouds/service_catalog/cudo_catalog.py +13 -3
sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
sky/clouds/service_catalog/do_catalog.py +111 -0
sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
sky/clouds/service_catalog/gcp_catalog.py +16 -2
sky/clouds/service_catalog/ibm_catalog.py +2 -2
sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
sky/clouds/service_catalog/lambda_catalog.py +8 -3
sky/clouds/service_catalog/nebius_catalog.py +116 -0
sky/clouds/service_catalog/oci_catalog.py +31 -4
sky/clouds/service_catalog/paperspace_catalog.py +2 -2
sky/clouds/service_catalog/runpod_catalog.py +2 -2
sky/clouds/service_catalog/scp_catalog.py +2 -2
sky/clouds/service_catalog/vast_catalog.py +104 -0
sky/clouds/service_catalog/vsphere_catalog.py +2 -2
sky/clouds/utils/aws_utils.py +65 -0
sky/clouds/utils/azure_utils.py +91 -0
sky/clouds/utils/gcp_utils.py +5 -9
sky/clouds/utils/oci_utils.py +47 -5
sky/clouds/utils/scp_utils.py +4 -3
sky/clouds/vast.py +280 -0
sky/clouds/vsphere.py +22 -18
sky/core.py +361 -107
sky/dag.py +41 -28
sky/data/data_transfer.py +37 -0
sky/data/data_utils.py +211 -32
sky/data/mounting_utils.py +182 -30
sky/data/storage.py +2118 -270
sky/data/storage_utils.py +126 -5
sky/exceptions.py +179 -8
sky/execution.py +158 -85
sky/global_user_state.py +150 -34
sky/jobs/__init__.py +12 -10
sky/jobs/client/__init__.py +0 -0
sky/jobs/client/sdk.py +302 -0
sky/jobs/constants.py +49 -11
sky/jobs/controller.py +161 -99
sky/jobs/dashboard/dashboard.py +171 -25
sky/jobs/dashboard/templates/index.html +572 -60
sky/jobs/recovery_strategy.py +157 -156
sky/jobs/scheduler.py +307 -0
sky/jobs/server/__init__.py +1 -0
sky/jobs/server/core.py +598 -0
sky/jobs/server/dashboard_utils.py +69 -0
sky/jobs/server/server.py +190 -0
sky/jobs/state.py +627 -122
sky/jobs/utils.py +615 -206
sky/models.py +27 -0
sky/optimizer.py +142 -83
sky/provision/__init__.py +20 -5
sky/provision/aws/config.py +124 -42
sky/provision/aws/instance.py +130 -53
sky/provision/azure/__init__.py +7 -0
sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
sky/provision/azure/config.py +220 -0
sky/provision/azure/instance.py +1012 -37
sky/provision/common.py +31 -3
sky/provision/constants.py +25 -0
sky/provision/cudo/__init__.py +2 -1
sky/provision/cudo/cudo_utils.py +112 -0
sky/provision/cudo/cudo_wrapper.py +37 -16
sky/provision/cudo/instance.py +28 -12
sky/provision/do/__init__.py +11 -0
sky/provision/do/config.py +14 -0
sky/provision/do/constants.py +10 -0
sky/provision/do/instance.py +287 -0
sky/provision/do/utils.py +301 -0
sky/provision/docker_utils.py +82 -46
sky/provision/fluidstack/fluidstack_utils.py +57 -125
sky/provision/fluidstack/instance.py +15 -43
sky/provision/gcp/config.py +19 -9
sky/provision/gcp/constants.py +7 -1
sky/provision/gcp/instance.py +55 -34
sky/provision/gcp/instance_utils.py +339 -80
sky/provision/gcp/mig_utils.py +210 -0
sky/provision/instance_setup.py +172 -133
sky/provision/kubernetes/__init__.py +1 -0
sky/provision/kubernetes/config.py +104 -90
sky/provision/kubernetes/constants.py +8 -0
sky/provision/kubernetes/instance.py +680 -325
sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
sky/provision/kubernetes/network.py +54 -20
sky/provision/kubernetes/network_utils.py +70 -21
sky/provision/kubernetes/utils.py +1370 -251
sky/provision/lambda_cloud/__init__.py +11 -0
sky/provision/lambda_cloud/config.py +10 -0
sky/provision/lambda_cloud/instance.py +265 -0
sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
sky/provision/logging.py +1 -1
sky/provision/nebius/__init__.py +11 -0
sky/provision/nebius/config.py +11 -0
sky/provision/nebius/instance.py +285 -0
sky/provision/nebius/utils.py +318 -0
sky/provision/oci/__init__.py +15 -0
sky/provision/oci/config.py +51 -0
sky/provision/oci/instance.py +436 -0
sky/provision/oci/query_utils.py +681 -0
sky/provision/paperspace/constants.py +6 -0
sky/provision/paperspace/instance.py +4 -3
sky/provision/paperspace/utils.py +2 -0
sky/provision/provisioner.py +207 -130
sky/provision/runpod/__init__.py +1 -0
sky/provision/runpod/api/__init__.py +3 -0
sky/provision/runpod/api/commands.py +119 -0
sky/provision/runpod/api/pods.py +142 -0
sky/provision/runpod/instance.py +64 -8
sky/provision/runpod/utils.py +239 -23
sky/provision/vast/__init__.py +10 -0
sky/provision/vast/config.py +11 -0
sky/provision/vast/instance.py +247 -0
sky/provision/vast/utils.py +162 -0
sky/provision/vsphere/common/vim_utils.py +1 -1
sky/provision/vsphere/instance.py +8 -18
sky/provision/vsphere/vsphere_utils.py +1 -1
sky/resources.py +247 -102
sky/serve/__init__.py +9 -9
sky/serve/autoscalers.py +361 -299
sky/serve/client/__init__.py +0 -0
sky/serve/client/sdk.py +366 -0
sky/serve/constants.py +12 -3
sky/serve/controller.py +106 -36
sky/serve/load_balancer.py +63 -12
sky/serve/load_balancing_policies.py +84 -2
sky/serve/replica_managers.py +42 -34
sky/serve/serve_state.py +62 -32
sky/serve/serve_utils.py +271 -160
sky/serve/server/__init__.py +0 -0
sky/serve/{core.py → server/core.py} +271 -90
sky/serve/server/server.py +112 -0
sky/serve/service.py +52 -16
sky/serve/service_spec.py +95 -32
sky/server/__init__.py +1 -0
sky/server/common.py +430 -0
sky/server/constants.py +21 -0
sky/server/html/log.html +174 -0
sky/server/requests/__init__.py +0 -0
sky/server/requests/executor.py +472 -0
sky/server/requests/payloads.py +487 -0
sky/server/requests/queues/__init__.py +0 -0
sky/server/requests/queues/mp_queue.py +76 -0
sky/server/requests/requests.py +567 -0
sky/server/requests/serializers/__init__.py +0 -0
sky/server/requests/serializers/decoders.py +192 -0
sky/server/requests/serializers/encoders.py +166 -0
sky/server/server.py +1106 -0
sky/server/stream_utils.py +141 -0
sky/setup_files/MANIFEST.in +2 -5
sky/setup_files/dependencies.py +159 -0
sky/setup_files/setup.py +14 -125
sky/sky_logging.py +59 -14
sky/skylet/autostop_lib.py +2 -2
sky/skylet/constants.py +183 -50
sky/skylet/events.py +22 -10
sky/skylet/job_lib.py +403 -258
sky/skylet/log_lib.py +111 -71
sky/skylet/log_lib.pyi +6 -0
sky/skylet/providers/command_runner.py +6 -8
sky/skylet/providers/ibm/node_provider.py +2 -2
sky/skylet/providers/scp/config.py +11 -3
sky/skylet/providers/scp/node_provider.py +8 -8
sky/skylet/skylet.py +3 -1
sky/skylet/subprocess_daemon.py +69 -17
sky/skypilot_config.py +119 -57
sky/task.py +205 -64
sky/templates/aws-ray.yml.j2 +37 -7
sky/templates/azure-ray.yml.j2 +27 -82
sky/templates/cudo-ray.yml.j2 +7 -3
sky/templates/do-ray.yml.j2 +98 -0
sky/templates/fluidstack-ray.yml.j2 +7 -4
sky/templates/gcp-ray.yml.j2 +26 -6
sky/templates/ibm-ray.yml.j2 +3 -2
sky/templates/jobs-controller.yaml.j2 +46 -11
sky/templates/kubernetes-ingress.yml.j2 +7 -0
sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
sky/templates/kubernetes-ray.yml.j2 +292 -25
sky/templates/lambda-ray.yml.j2 +30 -40
sky/templates/nebius-ray.yml.j2 +79 -0
sky/templates/oci-ray.yml.j2 +18 -57
sky/templates/paperspace-ray.yml.j2 +10 -6
sky/templates/runpod-ray.yml.j2 +26 -4
sky/templates/scp-ray.yml.j2 +3 -2
sky/templates/sky-serve-controller.yaml.j2 +12 -1
sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
sky/templates/vast-ray.yml.j2 +70 -0
sky/templates/vsphere-ray.yml.j2 +8 -3
sky/templates/websocket_proxy.py +64 -0
sky/usage/constants.py +10 -1
sky/usage/usage_lib.py +130 -37
sky/utils/accelerator_registry.py +35 -51
sky/utils/admin_policy_utils.py +147 -0
sky/utils/annotations.py +51 -0
sky/utils/cli_utils/status_utils.py +81 -23
sky/utils/cluster_utils.py +356 -0
sky/utils/command_runner.py +452 -89
sky/utils/command_runner.pyi +77 -3
sky/utils/common.py +54 -0
sky/utils/common_utils.py +319 -108
sky/utils/config_utils.py +204 -0
sky/utils/control_master_utils.py +48 -0
sky/utils/controller_utils.py +548 -266
sky/utils/dag_utils.py +93 -32
sky/utils/db_utils.py +18 -4
sky/utils/env_options.py +29 -7
sky/utils/kubernetes/create_cluster.sh +8 -60
sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
sky/utils/kubernetes/gpu_labeler.py +4 -4
sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
sky/utils/kubernetes/rsync_helper.sh +24 -0
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
sky/utils/log_utils.py +240 -33
sky/utils/message_utils.py +81 -0
sky/utils/registry.py +127 -0
sky/utils/resources_utils.py +94 -22
sky/utils/rich_utils.py +247 -18
sky/utils/schemas.py +284 -64
sky/{status_lib.py → utils/status_lib.py} +12 -7
sky/utils/subprocess_utils.py +212 -46
sky/utils/timeline.py +12 -7
sky/utils/ux_utils.py +168 -15
skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
{skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
sky/clouds/cloud_registry.py +0 -31
sky/jobs/core.py +0 -330
sky/skylet/providers/azure/__init__.py +0 -2
sky/skylet/providers/azure/azure-vm-template.json +0 -301
sky/skylet/providers/azure/config.py +0 -170
sky/skylet/providers/azure/node_provider.py +0 -466
sky/skylet/providers/lambda_cloud/__init__.py +0 -2
sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
sky/skylet/providers/oci/__init__.py +0 -2
sky/skylet/providers/oci/node_provider.py +0 -488
sky/skylet/providers/oci/query_helper.py +0 -383
sky/skylet/providers/oci/utils.py +0 -21
sky/utils/cluster_yaml_utils.py +0 -24
sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
{skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
{skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
{skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0

sky/jobs/state.py CHANGED Viewed

@@ -2,6 +2,7 @@
 # TODO(zhwu): maybe use file based status instead of database, so
 # that we can easily switch to a s3-based storage.
 import enum
+import json
 import pathlib
 import sqlite3
 import time
@@ -10,7 +11,9 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 import colorama
+from sky import exceptions
 from sky import sky_logging
+from sky.utils import common_utils
 from sky.utils import db_utils
 if typing.TYPE_CHECKING:
@@ -20,15 +23,6 @@ CallbackType = Callable[[str], None]
 logger = sky_logging.init_logger(__name__)
-_DB_PATH = pathlib.Path('~/.sky/spot_jobs.db')
-_DB_PATH = _DB_PATH.expanduser().absolute()
-_DB_PATH.parents[0].mkdir(parents=True, exist_ok=True)
-_DB_PATH = str(_DB_PATH)
-# Module-level connection/cursor; thread-safe as the module is only imported
-# once.
-_CONN = sqlite3.connect(_DB_PATH)
-_CURSOR = _CONN.cursor()
 # === Database schema ===
 # `spot` table contains all the finest-grained tasks, including all the
@@ -39,58 +33,124 @@ _CURSOR = _CONN.cursor()
 # the same content as the `task_name` column.
 # The `job_id` is now not really a job id, but only a unique
 # identifier/primary key for all the tasks. We will use `spot_job_id`
-# to identify the spot job.
+# to identify the job.
 # TODO(zhwu): schema migration may be needed.
-_CURSOR.execute("""\
-    CREATE TABLE IF NOT EXISTS spot (
-    job_id INTEGER PRIMARY KEY AUTOINCREMENT,
-    job_name TEXT,
-    resources TEXT,
-    submitted_at FLOAT,
-    status TEXT,
-    run_timestamp TEXT CANDIDATE KEY,
-    start_at FLOAT DEFAULT NULL,
-    end_at FLOAT DEFAULT NULL,
-    last_recovered_at FLOAT DEFAULT -1,
-    recovery_count INTEGER DEFAULT 0,
-    job_duration FLOAT DEFAULT 0,
-    failure_reason TEXT,
-    spot_job_id INTEGER,
-    task_id INTEGER DEFAULT 0,
-    task_name TEXT)""")
-_CONN.commit()
-db_utils.add_column_to_table(_CURSOR, _CONN, 'spot', 'failure_reason', 'TEXT')
-# Create a new column `spot_job_id`, which is the same for tasks of the
-# same managed job.
-# The original `job_id` no longer has an actual meaning, but only a legacy
-# identifier for all tasks in database.
-db_utils.add_column_to_table(_CURSOR,
-                             _CONN,
-                             'spot',
-                             'spot_job_id',
-                             'INTEGER',
-                             copy_from='job_id')
-db_utils.add_column_to_table(_CURSOR,
-                             _CONN,
-                             'spot',
-                             'task_id',
-                             'INTEGER DEFAULT 0',
-                             value_to_replace_existing_entries=0)
-db_utils.add_column_to_table(_CURSOR,
-                             _CONN,
-                             'spot',
-                             'task_name',
-                             'TEXT',
-                             copy_from='job_name')
-# `job_info` contains the mapping from job_id to the job_name.
-# In the future, it may contain more information about each job.
-_CURSOR.execute("""\
-    CREATE TABLE IF NOT EXISTS job_info (
-    spot_job_id INTEGER PRIMARY KEY AUTOINCREMENT,
-    name TEXT)""")
-_CONN.commit()
+def create_table(cursor, conn):
+    # Enable WAL mode to avoid locking issues.
+    # See: issue #3863, #1441 and PR #1509
+    # https://github.com/microsoft/WSL/issues/2395
+    # TODO(romilb): We do not enable WAL for WSL because of a known issue in WSL.
+    #  This may cause the database locked problem from WSL issue #1441.
+    if not common_utils.is_wsl():
+        try:
+            cursor.execute('PRAGMA journal_mode=WAL')
+        except sqlite3.OperationalError as e:
+            if 'database is locked' not in str(e):
+                raise
+            # If the database is locked, it is OK to continue, as the WAL mode
+            # is not critical and is likely to be enabled by other processes.
+    cursor.execute("""\
+        CREATE TABLE IF NOT EXISTS spot (
+        job_id INTEGER PRIMARY KEY AUTOINCREMENT,
+        job_name TEXT,
+        resources TEXT,
+        submitted_at FLOAT,
+        status TEXT,
+        run_timestamp TEXT CANDIDATE KEY,
+        start_at FLOAT DEFAULT NULL,
+        end_at FLOAT DEFAULT NULL,
+        last_recovered_at FLOAT DEFAULT -1,
+        recovery_count INTEGER DEFAULT 0,
+        job_duration FLOAT DEFAULT 0,
+        failure_reason TEXT,
+        spot_job_id INTEGER,
+        task_id INTEGER DEFAULT 0,
+        task_name TEXT,
+        specs TEXT,
+        local_log_file TEXT DEFAULT NULL)""")
+    conn.commit()
+    db_utils.add_column_to_table(cursor, conn, 'spot', 'failure_reason', 'TEXT')
+    # Create a new column `spot_job_id`, which is the same for tasks of the
+    # same managed job.
+    # The original `job_id` no longer has an actual meaning, but only a legacy
+    # identifier for all tasks in database.
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'spot',
+                                 'spot_job_id',
+                                 'INTEGER',
+                                 copy_from='job_id')
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'spot',
+                                 'task_id',
+                                 'INTEGER DEFAULT 0',
+                                 value_to_replace_existing_entries=0)
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'spot',
+                                 'task_name',
+                                 'TEXT',
+                                 copy_from='job_name')
+    # Specs is some useful information about the task, e.g., the
+    # max_restarts_on_errors value. It is stored in JSON format.
+    db_utils.add_column_to_table(cursor,
+                                 conn,
+                                 'spot',
+                                 'specs',
+                                 'TEXT',
+                                 value_to_replace_existing_entries=json.dumps({
+                                     'max_restarts_on_errors': 0,
+                                 }))
+    db_utils.add_column_to_table(cursor, conn, 'spot', 'local_log_file',
+                                 'TEXT DEFAULT NULL')
+    # `job_info` contains the mapping from job_id to the job_name, as well as
+    # information used by the scheduler.
+    cursor.execute("""\
+        CREATE TABLE IF NOT EXISTS job_info (
+        spot_job_id INTEGER PRIMARY KEY AUTOINCREMENT,
+        name TEXT,
+        schedule_state TEXT,
+        controller_pid INTEGER DEFAULT NULL,
+        dag_yaml_path TEXT,
+        env_file_path TEXT,
+        user_hash TEXT)""")
+    db_utils.add_column_to_table(cursor, conn, 'job_info', 'schedule_state',
+                                 'TEXT')
+    db_utils.add_column_to_table(cursor, conn, 'job_info', 'controller_pid',
+                                 'INTEGER DEFAULT NULL')
+    db_utils.add_column_to_table(cursor, conn, 'job_info', 'dag_yaml_path',
+                                 'TEXT')
+    db_utils.add_column_to_table(cursor, conn, 'job_info', 'env_file_path',
+                                 'TEXT')
+    db_utils.add_column_to_table(cursor, conn, 'job_info', 'user_hash', 'TEXT')
+    conn.commit()
+# Module-level database path and table setup; this runs once, when the module
+# is first imported.
+def _get_db_path() -> str:
+    """Workaround to collapse multi-step Path ops for type checker.
+    Ensures _DB_PATH is str, avoiding Union[Path, str] inference.
+    """
+    path = pathlib.Path('~/.sky/spot_jobs.db')
+    path = path.expanduser().absolute()
+    path.parents[0].mkdir(parents=True, exist_ok=True)
+    return str(path)
+_DB_PATH = _get_db_path()
+db_utils.SQLiteConn(_DB_PATH, create_table)
 # job_duration is the time a job actually runs (including the
 # setup duration) before last_recover, excluding the provision
@@ -120,9 +180,16 @@ columns = [
     'job_id',
     'task_id',
     'task_name',
+    'specs',
+    'local_log_file',
     # columns from the job_info table
     '_job_info_job_id',  # This should be the same as job_id
-    'job_name'
+    'job_name',
+    'schedule_state',
+    'controller_pid',
+    'dag_yaml_path',
+    'env_file_path',
+    'user_hash',
 ]
@@ -148,16 +215,19 @@ class ManagedJobStatus(enum.Enum):
         SUCCEEDED       ->  SUCCEEDED
         FAILED          ->  FAILED
         FAILED_SETUP    ->  FAILED_SETUP
+    Not all statuses are in this list, since some ManagedJobStatuses are only
+    possible while the cluster is INIT/STOPPED/not yet UP.
     Note that the JobStatus will not be stuck in PENDING, because each cluster
     is dedicated to a managed job, i.e. there should always be enough resource
     to run the job and the job will be immediately transitioned to RUNNING.
+    You can see a state diagram for ManagedJobStatus in sky/jobs/README.md.
     """
     # PENDING: Waiting for the jobs controller to have a slot to run the
     # controller process.
-    # The submitted_at timestamp of the managed job in the 'spot' table will be
-    # set to the time when the job is firstly submitted by the user (set to
-    # PENDING).
     PENDING = 'PENDING'
+    # The submitted_at timestamp of the managed job in the 'spot' table will be
+    # set to the time when the job controller begins running.
     # SUBMITTED: The jobs controller starts the controller process.
     SUBMITTED = 'SUBMITTED'
     # STARTING: The controller process is launching the cluster for the managed
@@ -171,12 +241,12 @@ class ManagedJobStatus(enum.Enum):
     # RECOVERING: The cluster is preempted, and the controller process is
     # recovering the cluster (relaunching/failover).
     RECOVERING = 'RECOVERING'
-    # Terminal statuses
-    # SUCCEEDED: The job is finished successfully.
-    SUCCEEDED = 'SUCCEEDED'
     # CANCELLING: The job is requested to be cancelled by the user, and the
     # controller is cleaning up the cluster.
     CANCELLING = 'CANCELLING'
+    # Terminal statuses
+    # SUCCEEDED: The job is finished successfully.
+    SUCCEEDED = 'SUCCEEDED'
     # CANCELLED: The job is cancelled by the user. When the managed job is in
     # CANCELLED status, the cluster has been cleaned up.
     CANCELLED = 'CANCELLED'
@@ -222,7 +292,6 @@ class ManagedJobStatus(enum.Enum):
             cls.FAILED_PRECHECKS,
             cls.FAILED_NO_RESOURCE,
             cls.FAILED_CONTROLLER,
-            cls.CANCELLING,
             cls.CANCELLED,
         ]
@@ -251,14 +320,74 @@ _SPOT_STATUS_TO_COLOR = {
 }
+class ManagedJobScheduleState(enum.Enum):
+    """Captures the state of the job from the scheduler's perspective.
+    A job that predates the introduction of the scheduler will be INVALID.
+    A newly created job will be INACTIVE.  The following transitions are valid:
+    - INACTIVE -> WAITING: The job is "submitted" to the scheduler, and its job
+      controller can be started.
+    - WAITING -> LAUNCHING: The job controller is being started by the scheduler and
+      may proceed to sky.launch.
+    - LAUNCHING -> ALIVE: The launch attempt was completed. It may have
+      succeeded or failed. The job controller is not allowed to sky.launch again
+      without transitioning to ALIVE_WAITING and then LAUNCHING.
+    - ALIVE -> ALIVE_WAITING: The job controller wants to sky.launch again,
+      either for recovery or to launch a subsequent task.
+    - ALIVE_WAITING -> LAUNCHING: The scheduler has determined that the job
+      controller may launch again.
+    - LAUNCHING, ALIVE, or ALIVE_WAITING -> DONE: The job controller is exiting
+      and the job is in some terminal status. In the future it may be possible
+      to transition directly from WAITING or even INACTIVE to DONE if the job is
+      cancelled.
+    You can see a state diagram in sky/jobs/README.md.
+    There is no well-defined mapping from the managed job status to schedule
+    state or vice versa. (In fact, schedule state is defined on the job and
+    status on the task.)
+    - INACTIVE or WAITING should only be seen when a job is PENDING.
+    - ALIVE_WAITING should only be seen when a job is RECOVERING, has multiple
+      tasks, or needs to retry launching.
+    - LAUNCHING and ALIVE can be seen in many different statuses.
+    - DONE should only be seen when a job is in a terminal status.
+    Since state and status transitions are not atomic, it may be possible to
+    briefly observe inconsistent states, like a job that just finished but
+    hasn't yet transitioned to DONE.
+    """
+    # This job may have been created before scheduler was introduced in #4458.
+    # This state is not used by scheduler but just for backward compatibility.
+    # TODO(cooperc): remove this in v0.11.0
+    INVALID = None
+    # The job should be ignored by the scheduler.
+    INACTIVE = 'INACTIVE'
+    # The job is waiting to transition to LAUNCHING for the first time. The
+    # scheduler should try to transition it, and when it does, it should start
+    # the job controller.
+    WAITING = 'WAITING'
+    # The job is already alive, but wants to transition back to LAUNCHING,
+    # e.g. for recovery, or launching later tasks in the DAG. The scheduler
+    # should try to transition it to LAUNCHING.
+    ALIVE_WAITING = 'ALIVE_WAITING'
+    # The job is running sky.launch, or soon will, using a limited number of
+    # allowed launch slots.
+    LAUNCHING = 'LAUNCHING'
+    # The controller for the job is running, but it's not currently launching.
+    ALIVE = 'ALIVE'
+    # The job is in a terminal state. (Not necessarily SUCCEEDED.)
+    DONE = 'DONE'
 # === Status transition functions ===
-def set_job_name(job_id: int, name: str):
+def set_job_info(job_id: int, name: str):
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         cursor.execute(
             """\
             INSERT INTO job_info
-            (spot_job_id, name)
-            VALUES (?, ?)""", (job_id, name))
+            (spot_job_id, name, schedule_state)
+            VALUES (?, ?, ?)""",
+            (job_id, name, ManagedJobScheduleState.INACTIVE.value))
 def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
@@ -275,16 +404,19 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
 def set_submitted(job_id: int, task_id: int, run_timestamp: str,
                   submit_time: float, resources_str: str,
-                  callback_func: CallbackType):
+                  specs: Dict[str, Union[str,
+                                         int]], callback_func: CallbackType):
     """Set the task to submitted.
     Args:
         job_id: The managed job ID.
         task_id: The task ID.
         run_timestamp: The run_timestamp of the run. This will be used to
-            determine the log directory of the managed task.
+        determine the log directory of the managed task.
         submit_time: The time when the managed task is submitted.
         resources_str: The resources string of the managed task.
+        specs: The specs of the managed task.
+        callback_func: The callback function.
     """
     # Use the timestamp in the `run_timestamp` ('sky-2022-10...'), to make
     # the log directory and submission time align with each other, so as to
@@ -298,11 +430,19 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
             resources=(?),
             submitted_at=(?),
             status=(?),
-            run_timestamp=(?)
+            run_timestamp=(?),
+            specs=(?)
             WHERE spot_job_id=(?) AND
-            task_id=(?)""",
+            task_id=(?) AND
+            status=(?) AND
+            end_at IS null""",
             (resources_str, submit_time, ManagedJobStatus.SUBMITTED.value,
-             run_timestamp, job_id, task_id))
+             run_timestamp, json.dumps(specs), job_id, task_id,
+             ManagedJobStatus.PENDING.value))
+        if cursor.rowcount != 1:
+            raise exceptions.ManagedJobStatusError(
+                f'Failed to set the task to submitted. '
+                f'({cursor.rowcount} rows updated)')
     callback_func('SUBMITTED')
@@ -314,7 +454,14 @@ def set_starting(job_id: int, task_id: int, callback_func: CallbackType):
             """\
             UPDATE spot SET status=(?)
             WHERE spot_job_id=(?) AND
-            task_id=(?)""", (ManagedJobStatus.STARTING.value, job_id, task_id))
+            task_id=(?) AND
+            status=(?) AND
+            end_at IS null""", (ManagedJobStatus.STARTING.value, job_id,
+                                task_id, ManagedJobStatus.SUBMITTED.value))
+        if cursor.rowcount != 1:
+            raise exceptions.ManagedJobStatusError(
+                f'Failed to set the task to starting. '
+                f'({cursor.rowcount} rows updated)')
     callback_func('STARTING')
@@ -327,15 +474,25 @@ def set_started(job_id: int, task_id: int, start_time: float,
             """\
             UPDATE spot SET status=(?), start_at=(?), last_recovered_at=(?)
             WHERE spot_job_id=(?) AND
-            task_id=(?)""",
+            task_id=(?) AND
+            status IN (?, ?) AND
+            end_at IS null""",
             (
                 ManagedJobStatus.RUNNING.value,
                 start_time,
                 start_time,
                 job_id,
                 task_id,
+                ManagedJobStatus.STARTING.value,
+                # If the task is empty, we will jump straight from PENDING to
+                # RUNNING
+                ManagedJobStatus.PENDING.value,
             ),
         )
+        if cursor.rowcount != 1:
+            raise exceptions.ManagedJobStatusError(
+                f'Failed to set the task to started. '
+                f'({cursor.rowcount} rows updated)')
     callback_func('STARTED')
@@ -348,8 +505,15 @@ def set_recovering(job_id: int, task_id: int, callback_func: CallbackType):
                 UPDATE spot SET
                 status=(?), job_duration=job_duration+(?)-last_recovered_at
                 WHERE spot_job_id=(?) AND
-                task_id=(?)""",
-            (ManagedJobStatus.RECOVERING.value, time.time(), job_id, task_id))
+                task_id=(?) AND
+                status=(?) AND
+                end_at IS null""",
+            (ManagedJobStatus.RECOVERING.value, time.time(), job_id, task_id,
+             ManagedJobStatus.RUNNING.value))
+        if cursor.rowcount != 1:
+            raise exceptions.ManagedJobStatusError(
+                f'Failed to set the task to recovering. '
+                f'({cursor.rowcount} rows updated)')
     callback_func('RECOVERING')
@@ -362,8 +526,15 @@ def set_recovered(job_id: int, task_id: int, recovered_time: float,
             UPDATE spot SET
             status=(?), last_recovered_at=(?), recovery_count=recovery_count+1
             WHERE spot_job_id=(?) AND
-            task_id=(?)""",
-            (ManagedJobStatus.RUNNING.value, recovered_time, job_id, task_id))
+            task_id=(?) AND
+            status=(?) AND
+            end_at IS null""",
+            (ManagedJobStatus.RUNNING.value, recovered_time, job_id, task_id,
+             ManagedJobStatus.RECOVERING.value))
+        if cursor.rowcount != 1:
+            raise exceptions.ManagedJobStatusError(
+                f'Failed to set the task to recovered. '
+                f'({cursor.rowcount} rows updated)')
     logger.info('==== Recovered. ====')
     callback_func('RECOVERED')
@@ -376,10 +547,16 @@ def set_succeeded(job_id: int, task_id: int, end_time: float,
             """\
             UPDATE spot SET
             status=(?), end_at=(?)
-            WHERE spot_job_id=(?) AND task_id=(?)
-            AND end_at IS null""",
-            (ManagedJobStatus.SUCCEEDED.value, end_time, job_id, task_id))
+            WHERE spot_job_id=(?) AND
+            task_id=(?) AND
+            status=(?) AND
+            end_at IS null""",
+            (ManagedJobStatus.SUCCEEDED.value, end_time, job_id, task_id,
+             ManagedJobStatus.RUNNING.value))
+        if cursor.rowcount != 1:
+            raise exceptions.ManagedJobStatusError(
+                f'Failed to set the task to succeeded. '
+                f'({cursor.rowcount} rows updated)')
     callback_func('SUCCEEDED')
     logger.info('Job succeeded.')
@@ -391,8 +568,12 @@ def set_failed(
     failure_reason: str,
     callback_func: Optional[CallbackType] = None,
     end_time: Optional[float] = None,
+    override_terminal: bool = False,
 ):
-    """Set an entire job or task to failed, if they are in non-terminal states.
+    """Set an entire job or task to failed.
+    By default, don't override tasks that are already terminal (that is, for
+    which end_at is already set).
     Args:
         job_id: The job id.
@@ -401,36 +582,55 @@ def set_failed(
         failure_type: The failure type. One of ManagedJobStatus.FAILED_*.
         failure_reason: The failure reason.
         end_time: The end time. If None, the current time will be used.
+        override_terminal: If True, override the current status even if end_at
+            is already set.
     """
     assert failure_type.is_failed(), failure_type
     end_time = time.time() if end_time is None else end_time
-    fields_to_set = {
-        'end_at': end_time,
+    fields_to_set: Dict[str, Any] = {
         'status': failure_type.value,
         'failure_reason': failure_reason,
     }
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         previous_status = cursor.execute(
             'SELECT status FROM spot WHERE spot_job_id=(?)',
-            (job_id,)).fetchone()
-        previous_status = ManagedJobStatus(previous_status[0])
-        if previous_status in [ManagedJobStatus.RECOVERING]:
-            # If the job is recovering, we should set the
-            # last_recovered_at to the end_time, so that the
-            # end_at - last_recovered_at will not be affect the job duration
-            # calculation.
+            (job_id,)).fetchone()[0]
+        previous_status = ManagedJobStatus(previous_status)
+        if previous_status == ManagedJobStatus.RECOVERING:
+            # If the job is recovering, we should set the last_recovered_at to
+            # the end_time, so that the end_at - last_recovered_at will not
+            # affect the job duration calculation.
             fields_to_set['last_recovered_at'] = end_time
         set_str = ', '.join(f'{k}=(?)' for k in fields_to_set)
-        task_str = '' if task_id is None else f' AND task_id={task_id}'
+        task_query_str = '' if task_id is None else 'AND task_id=(?)'
+        task_value = [] if task_id is None else [
+            task_id,
+        ]
-        cursor.execute(
-            f"""\
-            UPDATE spot SET
-            {set_str}
-            WHERE spot_job_id=(?){task_str} AND end_at IS null""",
-            (*list(fields_to_set.values()), job_id))
-    if callback_func:
+        if override_terminal:
+            # Use COALESCE for end_at to avoid overriding the existing end_at if
+            # it's already set.
+            cursor.execute(
+                f"""\
+                UPDATE spot SET
+                end_at = COALESCE(end_at, ?),
+                {set_str}
+                WHERE spot_job_id=(?) {task_query_str}""",
+                (end_time, *list(fields_to_set.values()), job_id, *task_value))
+        else:
+            # Only set if end_at is null, i.e. the previous status is not
+            # terminal.
+            cursor.execute(
+                f"""\
+                UPDATE spot SET
+                end_at = (?),
+                {set_str}
+                WHERE spot_job_id=(?) {task_query_str} AND end_at IS null""",
+                (end_time, *list(fields_to_set.values()), job_id, *task_value))
+        updated = cursor.rowcount > 0
+    if callback_func and updated:
         callback_func('FAILED')
     logger.info(failure_reason)
@@ -445,12 +645,15 @@ def set_cancelling(job_id: int, callback_func: CallbackType):
         rows = cursor.execute(
             """\
             UPDATE spot SET
-            status=(?), end_at=(?)
+            status=(?)
             WHERE spot_job_id=(?) AND end_at IS null""",
-            (ManagedJobStatus.CANCELLING.value, time.time(), job_id))
-        if rows.rowcount > 0:
-            logger.info('Cancelling the job...')
-            callback_func('CANCELLING')
+            (ManagedJobStatus.CANCELLING.value, job_id))
+        updated = rows.rowcount > 0
+    if updated:
+        logger.info('Cancelling the job...')
+        callback_func('CANCELLING')
+    else:
+        logger.info('Cancellation skipped, job is already terminal')
 def set_cancelled(job_id: int, callback_func: CallbackType):
@@ -466,26 +669,47 @@ def set_cancelled(job_id: int, callback_func: CallbackType):
             WHERE spot_job_id=(?) AND status=(?)""",
             (ManagedJobStatus.CANCELLED.value, time.time(), job_id,
              ManagedJobStatus.CANCELLING.value))
-        if rows.rowcount > 0:
-            logger.info('Job cancelled.')
-            callback_func('CANCELLED')
+        updated = rows.rowcount > 0
+    if updated:
+        logger.info('Job cancelled.')
+        callback_func('CANCELLED')
+    else:
+        logger.info('Cancellation skipped, job is not CANCELLING')
+def set_local_log_file(job_id: int, task_id: Optional[int],
+                       local_log_file: str):
+    """Store the local log file path on a job's row(s) in `spot`.
+
+    Args:
+        job_id: The job id, matched against spot_job_id.
+        task_id: If not None, only the row of this task is updated;
+            otherwise every row of the job is updated.
+        local_log_file: The value written to the local_log_file column.
+    """
+    where_clause = 'spot_job_id=(?)'
+    # The SET value comes first, followed by the WHERE values.
+    params = [local_log_file, job_id]
+    if task_id is not None:
+        where_clause = f'{where_clause} AND task_id=(?)'
+        params = [*params, task_id]
+    query = f'UPDATE spot SET local_log_file=(?) WHERE {where_clause}'
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute(query, params)
 # ======== utility functions ========
-def get_nonterminal_job_ids_by_name(name: Optional[str]) -> List[int]:
+def get_nonterminal_job_ids_by_name(name: Optional[str],
+                                    all_users: bool = False) -> List[int]:
     """Get non-terminal job ids by name."""
     statuses = ', '.join(['?'] * len(ManagedJobStatus.terminal_statuses()))
     field_values = [
         status.value for status in ManagedJobStatus.terminal_statuses()
     ]
-    name_filter = ''
+    job_filter = ''
+    if name is None and not all_users:
+        job_filter += 'AND (job_info.user_hash=(?)) '
+        field_values.append(common_utils.get_user_hash())
     if name is not None:
         # We match the job name from `job_info` for the jobs submitted after
         # #1982, and from `spot` for the jobs submitted before #1982, whose
         # job_info is not available.
-        name_filter = ('AND (job_info.name=(?) OR '
-                       '(job_info.name IS NULL AND spot.task_name=(?)))')
+        job_filter += ('AND (job_info.name=(?) OR '
+                       '(job_info.name IS NULL AND spot.task_name=(?))) ')
         field_values.extend([name, name])
     # Left outer join is used here instead of join, because the job_info does
@@ -499,6 +723,127 @@ def get_nonterminal_job_ids_by_name(name: Optional[str]) -> List[int]:
             ON spot.spot_job_id=job_info.spot_job_id
             WHERE status NOT IN
             ({statuses})
+            {job_filter}
+            ORDER BY spot.spot_job_id DESC""", field_values).fetchall()
+        job_ids = [row[0] for row in rows if row[0] is not None]
+        return job_ids
+def get_schedule_live_jobs(job_id: Optional[int]) -> List[Dict[str, Any]]:
+    """Return the jobs whose schedule_state is live.
+
+    A job is included when its schedule_state is anything other than
+    INACTIVE, WAITING, or DONE. A returned job should correspond to a live
+    job controller process, with one exception: the job may have just
+    transitioned from WAITING to LAUNCHING while the controller process has
+    not yet started.
+
+    Args:
+        job_id: If not None, restrict the lookup to this job.
+
+    Returns:
+        One dict per matching job_info row, ordered by descending job id,
+        with the keys 'job_id', 'schedule_state' (converted to a
+        ManagedJobScheduleState) and 'controller_pid'.
+    """
+    excluded_states = (
+        ManagedJobScheduleState.INACTIVE.value,
+        ManagedJobScheduleState.WAITING.value,
+        ManagedJobScheduleState.DONE.value,
+    )
+    if job_id is None:
+        job_filter = ''
+        params = excluded_states
+    else:
+        job_filter = 'AND spot_job_id=(?)'
+        params = (*excluded_states, job_id)
+    # Only job_info is read here: all three selected columns come from it.
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows = cursor.execute(
+            f"""\
+            SELECT spot_job_id, schedule_state, controller_pid
+            FROM job_info
+            WHERE schedule_state not in (?, ?, ?)
+            {job_filter}
+            ORDER BY spot_job_id DESC""", params).fetchall()
+        return [{
+            'job_id': found_id,
+            'schedule_state': ManagedJobScheduleState(raw_state),
+            'controller_pid': pid,
+        } for found_id, raw_state, pid in rows]
+def get_jobs_to_check_status(job_id: Optional[int] = None) -> List[int]:
+    """List the ids of jobs that need controller process checking.
+
+    A job id is selected when any of the following holds:
+    - its schedule_state is set and is not DONE;
+    - its schedule_state is DONE but it has a row in `spot` whose status
+      is non-terminal (an unexpected, inconsistent state);
+    - it has no schedule_state (legacy job) and it has a row in `spot`
+      whose status is non-terminal.
+
+    Args:
+        job_id: Optional job ID to check. If None, all jobs are considered.
+
+    Returns:
+        The distinct matching job ids, in descending order.
+    """
+    done_value = ManagedJobScheduleState.DONE.value
+    terminal_values = [
+        status.value for status in ManagedJobStatus.terminal_statuses()
+    ]
+    terminal_placeholders = ', '.join('?' for _ in terminal_values)
+    # Argument order mirrors the placeholders in the SQL below: DONE, DONE,
+    # the terminal statuses, then (optionally) the job id.
+    query_args = [done_value, done_value, *terminal_values]
+    job_filter = ''
+    if job_id is not None:
+        job_filter = 'AND spot.spot_job_id=(?)'
+        query_args.append(job_id)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows = cursor.execute(
+            f"""\
+            SELECT DISTINCT spot.spot_job_id
+            FROM spot
+            LEFT OUTER JOIN job_info
+            ON spot.spot_job_id=job_info.spot_job_id
+            WHERE (
+                -- non-legacy jobs that are not DONE
+                (job_info.schedule_state IS NOT NULL AND
+                 job_info.schedule_state IS NOT ?)
+                OR
+                -- legacy or that are in non-terminal status or
+                -- DONE jobs that are in non-terminal status
+                ((-- legacy jobs
+                  job_info.schedule_state IS NULL OR
+                  -- non-legacy DONE jobs
+                  job_info.schedule_state IS ?
+                 ) AND
+                 -- non-terminal
+                 status NOT IN ({terminal_placeholders}))
+            )
+            {job_filter}
+            ORDER BY spot.spot_job_id DESC""", query_args).fetchall()
+        return [found_id for (found_id,) in rows if found_id is not None]
+def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
+    """Get all job ids by name."""
+    name_filter = ''
+    field_values = []
+    if name is not None:
+        # We match the job name from `job_info` for the jobs submitted after
+        # #1982, and from `spot` for the jobs submitted before #1982, whose
+        # job_info is not available.
+        name_filter = ('WHERE (job_info.name=(?) OR '
+                       '(job_info.name IS NULL AND spot.task_name=(?)))')
+        field_values = [name, name]
+    # Left outer join is used here instead of join, because the job_info does
+    # not contain the managed jobs submitted before #1982.
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows = cursor.execute(
+            f"""\
+            SELECT DISTINCT spot.spot_job_id
+            FROM spot
+            LEFT OUTER JOIN job_info
+            ON spot.spot_job_id=job_info.spot_job_id
             {name_filter}
             ORDER BY spot.spot_job_id DESC""", field_values).fetchall()
         job_ids = [row[0] for row in rows if row[0] is not None]
@@ -532,12 +877,14 @@ def get_latest_task_id_status(
     If the job_id does not exist, (None, None) will be returned.
     """
     id_statuses = _get_all_task_ids_statuses(job_id)
-    if len(id_statuses) == 0:
+    if not id_statuses:
         return None, None
-    task_id, status = id_statuses[-1]
-    for task_id, status in id_statuses:
-        if not status.is_terminal():
-            break
+    task_id, status = next(
+        ((tid, st) for tid, st in id_statuses if not st.is_terminal()),
+        id_statuses[-1],
+    )
+    # Unpack the tuple first, or it triggers a Pylint bug in recognizing
+    # the return type.
     return task_id, status
@@ -558,7 +905,7 @@ def get_failure_reason(job_id: int) -> Optional[str]:
             WHERE spot_job_id=(?)
             ORDER BY task_id ASC""", (job_id,)).fetchall()
         reason = [r[0] for r in reason if r[0] is not None]
-        if len(reason) == 0:
+        if not reason:
             return None
         return reason[0]
@@ -572,6 +919,9 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
     # existing controller before #1982, the job_info table may not exist,
     # and all the managed jobs created before will not present in the
     # job_info.
+    # Note: we will get the user_hash here, but don't try to call
+    # global_user_state.get_user() on it. This runs on the controller, which may
+    # not have the user info. Prefer to do it on the API server side.
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         rows = cursor.execute(f"""\
             SELECT *
@@ -584,6 +934,8 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
         for row in rows:
             job_dict = dict(zip(columns, row))
             job_dict['status'] = ManagedJobStatus(job_dict['status'])
+            job_dict['schedule_state'] = ManagedJobScheduleState(
+                job_dict['schedule_state'])
             if job_dict['job_name'] is None:
                 job_dict['job_name'] = job_dict['task_name']
             jobs.append(job_dict)
@@ -611,3 +963,156 @@ def get_latest_job_id() -> Optional[int]:
         for (job_id,) in rows:
             return job_id
         return None
+def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
+    """Return the JSON-decoded `specs` column of one task's row in `spot`.
+
+    Args:
+        job_id: The job id, matched against spot_job_id.
+        task_id: The task id within the job.
+    """
+    lookup_key = (job_id, task_id)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        row = cursor.execute(
+            """\
+            SELECT specs FROM spot
+            WHERE spot_job_id=(?) AND task_id=(?)""", lookup_key).fetchone()
+        # NOTE(review): presumably the row always exists when this is
+        # called; a missing row would fail on the indexing below - confirm.
+        specs_json = row[0]
+        return json.loads(specs_json)
+def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
+    """Get the local log file recorded for a job.
+
+    Args:
+        job_id: The job id, matched against spot_job_id.
+        task_id: If not None, also match this task id.
+
+    Returns:
+        The local_log_file value of the first matching row, or None when no
+        row matches.
+    """
+    where_clause = 'spot_job_id=(?)'
+    params = [job_id]
+    if task_id is not None:
+        where_clause = f'{where_clause} AND task_id=(?)'
+        params = [*params, task_id]
+    query = f'SELECT local_log_file FROM spot WHERE {where_clause}'
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        row = cursor.execute(query, params).fetchone()
+        if not row:
+            return None
+        # The query selects a single column, so it is the only element.
+        return row[0]
+# === Scheduler state functions ===
+# Only the scheduler should call these functions. They may require holding the
+# scheduler lock to work correctly.
+def scheduler_set_waiting(job_id: int, dag_yaml_path: str, env_file_path: str,
+                          user_hash: str) -> None:
+    """Move a job from INACTIVE to WAITING and record its launch inputs.
+
+    Do not call without holding the scheduler lock.
+
+    Args:
+        job_id: The job id, matched against spot_job_id in job_info.
+        dag_yaml_path: Value stored in the dag_yaml_path column.
+        env_file_path: Value stored in the env_file_path column.
+        user_hash: Value stored in the user_hash column.
+    """
+    query = (
+        'UPDATE job_info SET '
+        'schedule_state = (?), dag_yaml_path = (?), env_file_path = (?), '
+        '  user_hash = (?) '
+        'WHERE spot_job_id = (?) AND schedule_state = (?)')
+    params = (ManagedJobScheduleState.WAITING.value, dag_yaml_path,
+              env_file_path, user_hash, job_id,
+              ManagedJobScheduleState.INACTIVE.value)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows_changed = cursor.execute(query, params).rowcount
+        # Exactly one row must have been in INACTIVE for this job.
+        assert rows_changed == 1, (job_id, rows_changed)
+def scheduler_set_launching(job_id: int,
+                            current_state: ManagedJobScheduleState) -> None:
+    """Move a job from `current_state` to LAUNCHING.
+
+    Do not call without holding the scheduler lock.
+
+    Args:
+        job_id: The job id, matched against spot_job_id in job_info.
+        current_state: The schedule state the job must currently be in for
+            the update to apply.
+    """
+    query = ('UPDATE job_info SET '
+             'schedule_state = (?) '
+             'WHERE spot_job_id = (?) AND schedule_state = (?)')
+    params = (ManagedJobScheduleState.LAUNCHING.value, job_id,
+              current_state.value)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows_changed = cursor.execute(query, params).rowcount
+        # Exactly one row must have matched the expected current state.
+        assert rows_changed == 1, (job_id, rows_changed)
+def scheduler_set_alive(job_id: int) -> None:
+    """Move a job from LAUNCHING to ALIVE.
+
+    Do not call without holding the scheduler lock.
+    """
+    query = ('UPDATE job_info SET '
+             'schedule_state = (?) '
+             'WHERE spot_job_id = (?) AND schedule_state = (?)')
+    params = (ManagedJobScheduleState.ALIVE.value, job_id,
+              ManagedJobScheduleState.LAUNCHING.value)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows_changed = cursor.execute(query, params).rowcount
+        # Exactly one row must have been in LAUNCHING for this job.
+        assert rows_changed == 1, (job_id, rows_changed)
+def scheduler_set_alive_waiting(job_id: int) -> None:
+    """Move a job from ALIVE to ALIVE_WAITING.
+
+    Do not call without holding the scheduler lock.
+    """
+    query = ('UPDATE job_info SET '
+             'schedule_state = (?) '
+             'WHERE spot_job_id = (?) AND schedule_state = (?)')
+    params = (ManagedJobScheduleState.ALIVE_WAITING.value, job_id,
+              ManagedJobScheduleState.ALIVE.value)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows_changed = cursor.execute(query, params).rowcount
+        # Exactly one row must have been in ALIVE for this job.
+        assert rows_changed == 1, (job_id, rows_changed)
+def scheduler_set_done(job_id: int, idempotent: bool = False) -> None:
+    """Set a job's schedule_state to DONE unless it is already DONE.
+
+    Do not call without holding the scheduler lock.
+
+    Args:
+        job_id: The job id, matched against spot_job_id in job_info.
+        idempotent: If False, assert that exactly one row was updated. If
+            True, the number of updated rows is not checked.
+    """
+    done_value = ManagedJobScheduleState.DONE.value
+    query = ('UPDATE job_info SET '
+             'schedule_state = (?) '
+             'WHERE spot_job_id = (?) AND schedule_state != (?)')
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows_changed = cursor.execute(
+            query, (done_value, job_id, done_value)).rowcount
+        # The row-count check is skipped entirely for idempotent calls.
+        assert idempotent or rows_changed == 1, (job_id, rows_changed)
+def set_job_controller_pid(job_id: int, pid: int):
+    """Record `pid` in the controller_pid column of the job's job_info row.
+
+    Args:
+        job_id: The job id, matched against spot_job_id in job_info.
+        pid: The value stored in the controller_pid column.
+    """
+    query = ('UPDATE job_info SET '
+             'controller_pid = (?) '
+             'WHERE spot_job_id = (?)')
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows_changed = cursor.execute(query, (pid, job_id)).rowcount
+        # Exactly one job_info row must exist for this job.
+        assert rows_changed == 1, (job_id, rows_changed)
+def get_job_schedule_state(job_id: int) -> ManagedJobScheduleState:
+    """Look up the schedule state stored in job_info for `job_id`."""
+    query = 'SELECT schedule_state FROM job_info WHERE spot_job_id = (?)'
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        row = cursor.execute(query, (job_id,)).fetchone()
+        # NOTE(review): presumably a job_info row always exists for the
+        # given job id; a missing row would fail on the indexing below -
+        # confirm against callers.
+        raw_state = row[0]
+        return ManagedJobScheduleState(raw_state)
+def get_num_launching_jobs() -> int:
+    """Count the job_info rows whose schedule_state is LAUNCHING."""
+    query = ('SELECT COUNT(*) '
+             'FROM job_info '
+             'WHERE schedule_state = (?)')
+    launching_value = ManagedJobScheduleState.LAUNCHING.value
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        count_row = cursor.execute(query, (launching_value,)).fetchone()
+        return count_row[0]
+def get_num_alive_jobs() -> int:
+    """Count the job_info rows in ALIVE_WAITING, LAUNCHING, or ALIVE."""
+    query = ('SELECT COUNT(*) '
+             'FROM job_info '
+             'WHERE schedule_state IN (?, ?, ?)')
+    counted_states = (
+        ManagedJobScheduleState.ALIVE_WAITING.value,
+        ManagedJobScheduleState.LAUNCHING.value,
+        ManagedJobScheduleState.ALIVE.value,
+    )
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        count_row = cursor.execute(query, counted_states).fetchone()
+        return count_row[0]
+def get_waiting_job() -> Optional[Dict[str, Any]]:
+    """Get the next job that should transition to LAUNCHING.
+
+    The candidate is the job_info row with the smallest spot_job_id whose
+    schedule_state is WAITING or ALIVE_WAITING.
+
+    Backwards compatibility note: jobs submitted before #4485 will have no
+    schedule_state and will be ignored by this SQL query.
+
+    Returns:
+        None if no such job exists; otherwise a dict with the keys
+        'job_id', 'schedule_state' (a ManagedJobScheduleState),
+        'dag_yaml_path' and 'env_file_path'.
+    """
+    query = (
+        'SELECT spot_job_id, schedule_state, dag_yaml_path, env_file_path '
+        'FROM job_info '
+        'WHERE schedule_state in (?, ?) '
+        'ORDER BY spot_job_id LIMIT 1')
+    eligible_states = (ManagedJobScheduleState.WAITING.value,
+                       ManagedJobScheduleState.ALIVE_WAITING.value)
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        row = cursor.execute(query, eligible_states).fetchone()
+        if row is None:
+            return None
+        waiting_id, raw_state, dag_yaml_path, env_file_path = row
+        return {
+            'job_id': waiting_id,
+            'schedule_state': ManagedJobScheduleState(raw_state),
+            'dag_yaml_path': dag_yaml_path,
+            'env_file_path': env_file_path,
+        }

skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

skypilot-nightly 1.0.0.dev2024053101py3-none-any.whl → 1.0.0.dev2025022801py3-none-any.whl