skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250617__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. sky/__init__.py +2 -4
  2. sky/backends/cloud_vm_ray_backend.py +43 -60
  3. sky/cli.py +55 -637
  4. sky/client/cli.py +55 -637
  5. sky/clouds/kubernetes.py +3 -0
  6. sky/clouds/scp.py +7 -26
  7. sky/clouds/utils/scp_utils.py +177 -124
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
  10. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
  11. sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_buildManifest.js +1 -1
  12. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  13. sky/dashboard/out/clusters/[cluster].html +1 -1
  14. sky/dashboard/out/clusters.html +1 -1
  15. sky/dashboard/out/config.html +1 -1
  16. sky/dashboard/out/index.html +1 -1
  17. sky/dashboard/out/infra/[context].html +1 -1
  18. sky/dashboard/out/infra.html +1 -1
  19. sky/dashboard/out/jobs/[job].html +1 -1
  20. sky/dashboard/out/jobs.html +1 -1
  21. sky/dashboard/out/users.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/jobs/controller.py +98 -31
  26. sky/jobs/scheduler.py +37 -29
  27. sky/jobs/server/core.py +36 -3
  28. sky/jobs/state.py +69 -9
  29. sky/jobs/utils.py +11 -0
  30. sky/provision/__init__.py +1 -0
  31. sky/provision/scp/__init__.py +15 -0
  32. sky/provision/scp/config.py +93 -0
  33. sky/provision/scp/instance.py +528 -0
  34. sky/resources.py +164 -29
  35. sky/skylet/constants.py +39 -0
  36. sky/skylet/job_lib.py +8 -0
  37. sky/task.py +171 -21
  38. sky/templates/kubernetes-ray.yml.j2 +51 -4
  39. sky/templates/scp-ray.yml.j2 +3 -50
  40. sky/users/permission.py +19 -36
  41. sky/utils/command_runner.py +1 -1
  42. sky/utils/common_utils.py +16 -14
  43. sky/utils/context.py +1 -1
  44. sky/utils/controller_utils.py +12 -3
  45. sky/utils/dag_utils.py +17 -4
  46. sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
  47. sky/utils/schemas.py +43 -5
  48. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/METADATA +1 -1
  49. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/RECORD +54 -57
  50. sky/benchmark/__init__.py +0 -0
  51. sky/benchmark/benchmark_state.py +0 -295
  52. sky/benchmark/benchmark_utils.py +0 -641
  53. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
  54. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
  55. sky/skylet/providers/scp/__init__.py +0 -2
  56. sky/skylet/providers/scp/config.py +0 -149
  57. sky/skylet/providers/scp/node_provider.py +0 -578
  58. /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → vA3PPpkBwpRTRNBHFYAw_}/_ssgManifest.js +0 -0
  59. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/WHEEL +0 -0
  60. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/entry_points.txt +0 -0
  61. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/licenses/LICENSE +0 -0
  62. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250617.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -352,6 +352,16 @@ class ManagedJobStatus(enum.Enum):
             cls.FAILED_NO_RESOURCE, cls.FAILED_CONTROLLER
         ]
 
+    @classmethod
+    def processing_statuses(cls) -> List['ManagedJobStatus']:
+        # Any status that is not terminal and is not CANCELLING.
+        return [
+            cls.PENDING,
+            cls.STARTING,
+            cls.RUNNING,
+            cls.RECOVERING,
+        ]
+
 
 _SPOT_STATUS_TO_COLOR = {
     ManagedJobStatus.PENDING: colorama.Fore.BLUE,
@@ -607,21 +617,49 @@ def set_started(job_id: int, task_id: int, start_time: float,
 
 
 @_init_db
-def set_recovering(job_id: int, task_id: int, callback_func: CallbackType):
+def set_recovering(job_id: int, task_id: int, force_transit_to_recovering: bool,
+                   callback_func: CallbackType):
     """Set the task to recovering state, and update the job duration."""
     assert _DB_PATH is not None
     logger.info('=== Recovering... ===')
+    expected_status: List[str] = [ManagedJobStatus.RUNNING.value]
+    status_str = 'status=(?)'
+    if force_transit_to_recovering:
+        # For the HA job controller, it is possible that the jobs came from any
+        # processing status to recovering. But it should not be any terminal
+        # status as such jobs will not be recovered; and it should not be
+        # CANCELLING as we will directly trigger a cleanup.
+        expected_status = [
+            s.value for s in ManagedJobStatus.processing_statuses()
+        ]
+        question_mark_str = ', '.join(['?'] * len(expected_status))
+        status_str = f'status IN ({question_mark_str})'
+    # NOTE: if we are resuming from a controller failure and the previous status
+    # is STARTING, the initial value of `last_recovered_at` might not be set
+    # yet (default value -1). In this case, we should not add current timestamp.
+    # Otherwise, the job duration will be incorrect (~55 years from 1970).
+    current_time = time.time()
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         cursor.execute(
-            """\
+            f"""\
             UPDATE spot SET
-            status=(?), job_duration=job_duration+(?)-last_recovered_at
+            status=(?),
+            job_duration=CASE
+                WHEN last_recovered_at >= 0
+                THEN job_duration+(?)-last_recovered_at
+                ELSE job_duration
+            END,
+            last_recovered_at=CASE
+                WHEN last_recovered_at < 0
+                THEN (?)
+                ELSE last_recovered_at
+            END
             WHERE spot_job_id=(?) AND
             task_id=(?) AND
-            status=(?) AND
+            {status_str} AND
            end_at IS null""",
-            (ManagedJobStatus.RECOVERING.value, time.time(), job_id, task_id,
-             ManagedJobStatus.RUNNING.value))
+            (ManagedJobStatus.RECOVERING.value, current_time, current_time,
+             job_id, task_id, *expected_status))
         if cursor.rowcount != 1:
             raise exceptions.ManagedJobStatusError(
                 f'Failed to set the task to recovering. '
@@ -996,6 +1034,19 @@ def _get_all_task_ids_statuses(
     return [(row[0], ManagedJobStatus(row[1])) for row in id_statuses]
 
 
+@_init_db
+def get_job_status_with_task_id(job_id: int,
+                                task_id: int) -> Optional[ManagedJobStatus]:
+    assert _DB_PATH is not None
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        status = cursor.execute(
+            """\
+            SELECT status FROM spot
+            WHERE spot_job_id=(?) AND task_id=(?)""",
+            (job_id, task_id)).fetchone()
+        return ManagedJobStatus(status[0]) if status else None
+
+
 def get_num_tasks(job_id: int) -> int:
     return len(_get_all_task_ids_statuses(job_id))
 
@@ -1156,8 +1207,15 @@ def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
 @_init_db
 def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
                           original_user_yaml_path: str, env_file_path: str,
-                          user_hash: str, priority: int) -> None:
-    """Do not call without holding the scheduler lock."""
+                          user_hash: str, priority: int) -> bool:
+    """Do not call without holding the scheduler lock.
+
+    Returns: Whether this is a recovery run or not.
+        If this is a recovery run, the job may already be in the WAITING
+        state and the update will not change the schedule_state (hence the
+        updated_count will be 0). In this case, we return True.
+        Otherwise, we return False.
+    """
     assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         updated_count = cursor.execute(
@@ -1169,7 +1227,9 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
             (ManagedJobScheduleState.WAITING.value, dag_yaml_path,
              original_user_yaml_path, env_file_path, user_hash, priority,
              job_id, ManagedJobScheduleState.INACTIVE.value)).rowcount
-        assert updated_count == 1, (job_id, updated_count)
+        # For a recovery run, the job may already be in the WAITING state.
+        assert updated_count <= 1, (job_id, updated_count)
+        return updated_count == 0
 
 
 @_init_db
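
Note on the set_recovering change: the CASE expressions guard against a recovery run in which last_recovered_at still holds its default of -1 (e.g. the controller failed while the task was STARTING). Below is a minimal, standalone sketch, not SkyPilot code, that replays the same UPDATE against a throwaway in-memory SQLite table; the table contents and values are hypothetical and only illustrate the guard.

import sqlite3
import time

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE spot (spot_job_id INT, task_id INT, status TEXT, '
             'job_duration REAL, last_recovered_at REAL, end_at REAL)')
# A job resumed while still STARTING: last_recovered_at keeps its default -1.
conn.execute("INSERT INTO spot VALUES (1, 0, 'STARTING', 0, -1, NULL)")

now = time.time()
conn.execute(
    """UPDATE spot SET
       status=(?),
       job_duration=CASE WHEN last_recovered_at >= 0
                         THEN job_duration+(?)-last_recovered_at
                         ELSE job_duration END,
       last_recovered_at=CASE WHEN last_recovered_at < 0
                              THEN (?) ELSE last_recovered_at END
       WHERE spot_job_id=(?) AND task_id=(?) AND end_at IS NULL""",
    ('RECOVERING', now, now, 1, 0))

duration, recovered_at = conn.execute(
    'SELECT job_duration, last_recovered_at FROM spot').fetchone()
print(duration, recovered_at)  # -> 0.0 and the current timestamp

Without the guard, job_duration for such a row would be inflated by roughly the full interval since the Unix epoch, i.e. the "~55 years" mentioned in the diff comment.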
sky/jobs/utils.py CHANGED
@@ -176,6 +176,17 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
     Note: we expect that job_id, if provided, refers to a nonterminal job or a
     job that has not completed its cleanup (schedule state not DONE).
     """
+    # This signal file suggests that the controller is recovering from a
+    # failure. See sky/templates/kubernetes-ray.yml.j2 for more details.
+    # When restarting the controller processes, we don't want this event to
+    # set the job status to FAILED_CONTROLLER.
+    # TODO(tian): Change this to restart the controller process. For now we
+    # disabled it when recovering because we want to avoid caveats of infinite
+    # restart of last controller process that fully occupied the controller VM.
+    if os.path.exists(
+            os.path.expanduser(
+                constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
+        return
 
     def _cleanup_job_clusters(job_id: int) -> Optional[str]:
         """Clean up clusters for a job. Returns error message if any.
sky/provision/__init__.py CHANGED
@@ -24,6 +24,7 @@ from sky.provision import lambda_cloud
 from sky.provision import nebius
 from sky.provision import oci
 from sky.provision import runpod
+from sky.provision import scp
 from sky.provision import ssh
 from sky.provision import vast
 from sky.provision import vsphere
sky/provision/scp/__init__.py ADDED
@@ -0,0 +1,15 @@
+"""SCP provisioner for SkyPilot."""
+
+from sky.provision.scp.config import bootstrap_instances
+from sky.provision.scp.instance import cleanup_ports
+from sky.provision.scp.instance import get_cluster_info
+from sky.provision.scp.instance import open_ports
+from sky.provision.scp.instance import query_instances
+from sky.provision.scp.instance import run_instances
+from sky.provision.scp.instance import stop_instances
+from sky.provision.scp.instance import terminate_instances
+from sky.provision.scp.instance import wait_instances
+
+__all__ = ('bootstrap_instances', 'cleanup_ports', 'get_cluster_info',
+           'open_ports', 'query_instances', 'run_instances', 'stop_instances',
+           'terminate_instances', 'wait_instances')
sky/provision/scp/config.py ADDED
@@ -0,0 +1,93 @@
+"""SCP configuration bootstrapping."""
+
+import subprocess
+
+from sky.clouds.utils import scp_utils
+from sky.provision import common
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del cluster_name
+
+    node_cfg = config.node_config
+    zone_id = _get_zone_id(region)
+    node_cfg['zone_id'] = zone_id
+
+    docker_cfg = config.docker_config
+    docker_cfg['imageId'] = node_cfg['imageId']
+    docker_cfg['serviceZoneId'] = zone_id
+    docker_cfg['serverType'] = node_cfg['InstanceType']
+    docker_cfg['contractId'] = 'None'
+    ssh_public_key = node_cfg['AuthorizedKey']
+    docker_cfg['initialScript'] = _get_init_script(ssh_public_key)
+
+    key_pair_id = _get_key_pair_id()
+    miscellaneous = {
+        'deletionProtectionEnabled': False,
+        'keyPairId': key_pair_id,
+        'blockStorage': {
+            'blockStorageName': 'skystorage',
+            'diskSize': node_cfg['diskSize'],
+            'encryptEnabled': False,
+            'productId': 'PRODUCT-sRlJ34iBr9hOxN9J5PrQxo'
+        },
+        'nic': {
+            'natEnabled': True
+        },
+    }
+
+    docker_cfg.update(miscellaneous)
+
+    return config
+
+
+def _get_zone_id(region_name: str):
+    zone_contents = scp_utils.SCPClient().get_zones()
+    zone_dict = {
+        item['serviceZoneName']: item['serviceZoneId'] for item in zone_contents
+    }
+    return zone_dict[region_name]
+
+
+def _get_init_script(ssh_public_key: str):
+    init_script_content = _get_default_config_cmd() + _get_ssh_key_gen_cmd(
+        ssh_public_key)
+    init_script_content_string = f'"{init_script_content}"'
+    command = f'echo {init_script_content_string} | base64'
+    result = subprocess.run(command,
+                            shell=True,
+                            capture_output=True,
+                            text=True,
+                            check=True)
+    init_script_content_base64 = result.stdout
+    return {
+        'encodingType': 'base64',
+        'initialScriptShell': 'bash',
+        'initialScriptType': 'text',
+        'initialScriptContent': init_script_content_base64
+    }
+
+
+def _get_default_config_cmd():
+    cmd_list = ['apt-get update', 'apt-get -y install python3-pip']
+    res = ''
+    for cmd in cmd_list:
+        res += cmd + '; '
+    return res
+
+
+def _get_ssh_key_gen_cmd(ssh_public_key: str):
+    cmd_st = 'mkdir -p ~/.ssh/; touch ~/.ssh/authorized_keys;'
+    cmd_ed = 'chmod 644 ~/.ssh/authorized_keys; chmod 700 ~/.ssh/'
+    cmd = "echo '{}' &>>~/.ssh/authorized_keys;".format(ssh_public_key)  # pylint: disable=invalid-string-quote
+    return cmd_st + cmd + cmd_ed
+
+
+def _get_key_pair_id():
+    key_pairs = scp_utils.SCPClient().get_key_pairs()
+    if key_pairs['totalCount'] == 0:
+        raise RuntimeError('create key pair')
+    return key_pairs['contents'][0]['keyPairId']
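
Note on _get_init_script: the new config.py base64-encodes the startup script by piping it through the shell (echo ... | base64). For comparison only, and not what the file ships, the same payload can be produced in-process with Python's standard base64 module; the sketch below ignores base64's default 76-column line wrapping and assumes the trailing newline that echo adds, and the public-key value is a placeholder.

import base64


def encode_init_script(script: str) -> str:
    # Roughly equivalent to `echo "<script>" | base64`: echo appends a
    # trailing newline, so add it before encoding.
    return base64.b64encode((script + '\n').encode()).decode()


print(encode_init_script(
    'apt-get update; apt-get -y install python3-pip; '
    "mkdir -p ~/.ssh/; echo '<pubkey>' >> ~/.ssh/authorized_keys;"))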