skypilot-nightly 1.0.0.dev20250203__py3-none-any.whl → 1.0.0.dev20250205__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/vast.py +29 -0
  3. sky/authentication.py +18 -0
  4. sky/backends/backend_utils.py +4 -1
  5. sky/backends/cloud_vm_ray_backend.py +1 -0
  6. sky/check.py +2 -2
  7. sky/clouds/__init__.py +2 -0
  8. sky/clouds/service_catalog/constants.py +1 -1
  9. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  10. sky/clouds/service_catalog/kubernetes_catalog.py +11 -6
  11. sky/clouds/service_catalog/vast_catalog.py +104 -0
  12. sky/clouds/vast.py +279 -0
  13. sky/jobs/dashboard/dashboard.py +156 -20
  14. sky/jobs/dashboard/templates/index.html +557 -78
  15. sky/jobs/scheduler.py +14 -5
  16. sky/provision/__init__.py +1 -0
  17. sky/provision/lambda_cloud/instance.py +17 -1
  18. sky/provision/vast/__init__.py +10 -0
  19. sky/provision/vast/config.py +11 -0
  20. sky/provision/vast/instance.py +247 -0
  21. sky/provision/vast/utils.py +161 -0
  22. sky/serve/serve_state.py +23 -21
  23. sky/setup_files/dependencies.py +1 -0
  24. sky/templates/vast-ray.yml.j2 +70 -0
  25. sky/utils/controller_utils.py +5 -0
  26. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/METADATA +4 -1
  27. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/RECORD +31 -22
  28. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/LICENSE +0 -0
  29. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/WHEEL +0 -0
  30. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/entry_points.txt +0 -0
  31. {skypilot_nightly-1.0.0.dev20250203.dist-info → skypilot_nightly-1.0.0.dev20250205.dist-info}/top_level.txt +0 -0
sky/clouds/vast.py ADDED
@@ -0,0 +1,279 @@
1
+ """ Vast Cloud. """
2
+
3
+ import typing
4
+ from typing import Dict, Iterator, List, Optional, Tuple, Union
5
+
6
+ from sky import clouds
7
+ from sky.clouds import service_catalog
8
+ from sky.utils import resources_utils
9
+
10
+ if typing.TYPE_CHECKING:
11
+ from sky import resources as resources_lib
12
+
13
+
14
+ @clouds.CLOUD_REGISTRY.register
15
+ class Vast(clouds.Cloud):
16
+ """ Vast GPU Cloud
17
+
18
+ _REPR | The string representation for the Vast GPU cloud object.
19
+ """
20
+ _REPR = 'Vast'
21
+ _CLOUD_UNSUPPORTED_FEATURES = {
22
+ clouds.CloudImplementationFeatures.MULTI_NODE:
23
+ ('Multi-node not supported yet, as the interconnection among nodes '
24
+ 'are non-trivial on Vast.'),
25
+ clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
26
+ ('Customizing disk tier is not supported yet on Vast.'),
27
+ clouds.CloudImplementationFeatures.OPEN_PORTS:
28
+ ('Opening ports is currently not supported on Vast.'),
29
+ clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
30
+ ('Mounting object stores is not supported on Vast.'),
31
+ }
32
+ #
33
+ # Vast doesn't have a max cluster name limit. This number
34
+ # is reasonably large and exists to play nicely with the
35
+ # other providers
36
+ #
37
+ _MAX_CLUSTER_NAME_LEN_LIMIT = 120
38
+ _regions: List[clouds.Region] = []
39
+
40
+ PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
41
+ STATUS_VERSION = clouds.StatusVersion.SKYPILOT
42
+
43
+ @classmethod
44
+ def _unsupported_features_for_resources(
45
+ cls, resources: 'resources_lib.Resources'
46
+ ) -> Dict[clouds.CloudImplementationFeatures, str]:
47
+ """The features not supported based on the resources provided.
48
+
49
+ This method is used by check_features_are_supported() to check if the
50
+ cloud implementation supports all the requested features.
51
+
52
+ Returns:
53
+ A dict of {feature: reason} for the features not supported by the
54
+ cloud implementation.
55
+ """
56
+ del resources # unused
57
+ return cls._CLOUD_UNSUPPORTED_FEATURES
58
+
59
+ @classmethod
60
+ def _max_cluster_name_length(cls) -> Optional[int]:
61
+ return cls._MAX_CLUSTER_NAME_LEN_LIMIT
62
+
63
+ @classmethod
64
+ def regions_with_offering(cls, instance_type: str,
65
+ accelerators: Optional[Dict[str, int]],
66
+ use_spot: bool, region: Optional[str],
67
+ zone: Optional[str]) -> List[clouds.Region]:
68
+ assert zone is None, 'Vast does not support zones.'
69
+ del accelerators, zone # unused
70
+ regions = service_catalog.get_region_zones_for_instance_type(
71
+ instance_type, use_spot, 'vast')
72
+
73
+ if region is not None:
74
+ regions = [r for r in regions if r.name == region]
75
+ return regions
76
+
77
+ @classmethod
78
+ def get_vcpus_mem_from_instance_type(
79
+ cls,
80
+ instance_type: str,
81
+ ) -> Tuple[Optional[float], Optional[float]]:
82
+ return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
83
+ clouds='vast')
84
+
85
+ @classmethod
86
+ def zones_provision_loop(
87
+ cls,
88
+ *,
89
+ region: str,
90
+ num_nodes: int,
91
+ instance_type: str,
92
+ accelerators: Optional[Dict[str, int]] = None,
93
+ use_spot: bool = False,
94
+ ) -> Iterator[None]:
95
+ del num_nodes # unused
96
+ regions = cls.regions_with_offering(instance_type,
97
+ accelerators,
98
+ use_spot,
99
+ region=region,
100
+ zone=None)
101
+ for r in regions:
102
+ assert r.zones is None, r
103
+ yield r.zones
104
+
105
+ def instance_type_to_hourly_cost(self,
106
+ instance_type: str,
107
+ use_spot: bool,
108
+ region: Optional[str] = None,
109
+ zone: Optional[str] = None) -> float:
110
+ return service_catalog.get_hourly_cost(instance_type,
111
+ use_spot=use_spot,
112
+ region=region,
113
+ zone=zone,
114
+ clouds='vast')
115
+
116
+ def accelerators_to_hourly_cost(self,
117
+ accelerators: Dict[str, int],
118
+ use_spot: bool,
119
+ region: Optional[str] = None,
120
+ zone: Optional[str] = None) -> float:
121
+ """Returns the hourly cost of the accelerators, in dollars/hour."""
122
+ del accelerators, use_spot, region, zone # unused
123
+ return 0.0 # Vast includes accelerators in the hourly cost.
124
+
125
+ def get_egress_cost(self, num_gigabytes: float) -> float:
126
+ return 0.0
127
+
128
+ @classmethod
129
+ def get_default_instance_type(
130
+ cls,
131
+ cpus: Optional[str] = None,
132
+ memory: Optional[str] = None,
133
+ disk_tier: Optional[resources_utils.DiskTier] = None
134
+ ) -> Optional[str]:
135
+ """Returns the default instance type for Vast."""
136
+ return service_catalog.get_default_instance_type(cpus=cpus,
137
+ memory=memory,
138
+ disk_tier=disk_tier,
139
+ clouds='vast')
140
+
141
+ @classmethod
142
+ def get_accelerators_from_instance_type(
143
+ cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
144
+ return service_catalog.get_accelerators_from_instance_type(
145
+ instance_type, clouds='vast')
146
+
147
+ @classmethod
148
+ def get_zone_shell_cmd(cls) -> Optional[str]:
149
+ return None
150
+
151
+ def make_deploy_resources_variables(
152
+ self,
153
+ resources: 'resources_lib.Resources',
154
+ cluster_name: resources_utils.ClusterName,
155
+ region: 'clouds.Region',
156
+ zones: Optional[List['clouds.Zone']],
157
+ num_nodes: int,
158
+ dryrun: bool = False) -> Dict[str, Optional[str]]:
159
+ del zones, dryrun, cluster_name, num_nodes # unused
160
+
161
+ r = resources
162
+ acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
163
+ custom_resources = resources_utils.make_ray_custom_resources_str(
164
+ acc_dict)
165
+
166
+ if r.image_id is None:
167
+ image_id = 'vastai/base:0.0.2'
168
+ elif r.extract_docker_image() is not None:
169
+ image_id = r.extract_docker_image()
170
+ else:
171
+ image_id = r.image_id[r.region]
172
+
173
+ return {
174
+ 'instance_type': resources.instance_type,
175
+ 'custom_resources': custom_resources,
176
+ 'region': region.name,
177
+ 'image_id': image_id,
178
+ }
179
+
180
+ def _get_feasible_launchable_resources(
181
+ self, resources: 'resources_lib.Resources'
182
+ ) -> 'resources_utils.FeasibleResources':
183
+ """Returns a list of feasible resources for the given resources."""
184
+ if resources.instance_type is not None:
185
+ assert resources.is_launchable(), resources
186
+ resources = resources.copy(accelerators=None)
187
+ return resources_utils.FeasibleResources([resources], [], None)
188
+
189
+ def _make(instance_list):
190
+ resource_list = []
191
+ for instance_type in instance_list:
192
+ r = resources.copy(
193
+ cloud=Vast(),
194
+ instance_type=instance_type,
195
+ accelerators=None,
196
+ cpus=None,
197
+ )
198
+ resource_list.append(r)
199
+ return resource_list
200
+
201
+ # Currently, handle a filter on accelerators only.
202
+ accelerators = resources.accelerators
203
+ if accelerators is None:
204
+ # Return a default instance type
205
+ default_instance_type = Vast.get_default_instance_type(
206
+ cpus=resources.cpus,
207
+ memory=resources.memory,
208
+ disk_tier=resources.disk_tier)
209
+ if default_instance_type is None:
210
+ # TODO: Add hints to all return values in this method to help
211
+ # users understand why the resources are not launchable.
212
+ return resources_utils.FeasibleResources([], [], None)
213
+ else:
214
+ return resources_utils.FeasibleResources(
215
+ _make([default_instance_type]), [], None)
216
+
217
+ assert len(accelerators) == 1, resources
218
+ acc, acc_count = list(accelerators.items())[0]
219
+ (instance_list, fuzzy_candidate_list
220
+ ) = service_catalog.get_instance_type_for_accelerator(
221
+ acc,
222
+ acc_count,
223
+ use_spot=resources.use_spot,
224
+ cpus=resources.cpus,
225
+ region=resources.region,
226
+ zone=resources.zone,
227
+ memory=resources.memory,
228
+ clouds='vast')
229
+ if instance_list is None:
230
+ return resources_utils.FeasibleResources([], fuzzy_candidate_list,
231
+ None)
232
+ return resources_utils.FeasibleResources(_make(instance_list),
233
+ fuzzy_candidate_list, None)
234
+
235
+ @classmethod
236
+ def check_credentials(cls) -> Tuple[bool, Optional[str]]:
237
+ """ Verify that the user has valid credentials for Vast. """
238
+ try:
239
+ import vastai_sdk as _vast # pylint: disable=import-outside-toplevel
240
+ vast = _vast.VastAI()
241
+
242
+ # We only support file pased credential passing
243
+ if vast.creds_source != 'FILE':
244
+ return False, (
245
+ 'error \n' # First line is indented by 4 spaces
246
+ ' Credentials can be set up by running: \n'
247
+ ' $ pip install vastai\n'
248
+ ' $ echo [key] > ~/.vast_api_key\n'
249
+ ' For more information, see https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#vast' # pylint: disable=line-too-long
250
+ )
251
+
252
+ return True, None
253
+
254
+ except ImportError:
255
+ return False, ('Failed to import vast. '
256
+ 'To install, run: pip install skypilot[vast]')
257
+
258
+ def get_credential_file_mounts(self) -> Dict[str, str]:
259
+ return {
260
+ '~/.config/vastai/vast_api_key': '~/.config/vastai/vast_api_key'
261
+ }
262
+
263
+ @classmethod
264
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
265
+ # NOTE: used for very advanced SkyPilot functionality
266
+ # Can implement later if desired
267
+ return None
268
+
269
+ def instance_type_exists(self, instance_type: str) -> bool:
270
+ return service_catalog.instance_type_exists(instance_type, 'vast')
271
+
272
+ def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
273
+ return service_catalog.validate_region_zone(region, zone, clouds='vast')
274
+
275
+ @classmethod
276
+ def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
277
+ # TODO: use 0.0 for now to allow all images. We should change this to
278
+ # return the docker image size.
279
+ return 0.0
@@ -6,13 +6,17 @@ https://github.com/ray-project/ray/tree/master/dashboard/client/src) and/or get
6
6
  rid of the SSH port-forwarding business (see cli.py's job_dashboard()
7
7
  comment).
8
8
  """
9
+ import collections
9
10
  import datetime
11
+ import enum
12
+ import os
10
13
  import pathlib
11
14
 
12
15
  import flask
13
16
  import yaml
14
17
 
15
18
  from sky import jobs as managed_jobs
19
+ from sky.jobs import constants as managed_job_constants
16
20
  from sky.utils import common_utils
17
21
  from sky.utils import controller_utils
18
22
 
@@ -41,6 +45,92 @@ def _is_running_on_jobs_controller() -> bool:
41
45
  return False
42
46
 
43
47
 
48
# Column indices for job table
class JobTableColumns(enum.IntEnum):
    """Column indices for the jobs table in the dashboard.

    - DROPDOWN (0): Column for expandable dropdown arrow
    - ID (1): Job ID column
    - TASK (2): Task name/number column
    - NAME (3): Job name column
    - RESOURCES (4): Resources used by job
    - SUBMITTED (5): Job submission timestamp
    - TOTAL_DURATION (6): Total time since job submission
    - JOB_DURATION (7): Actual job runtime
    - RECOVERIES (8): Number of job recoveries
    - STATUS (9): Current job status
    - STARTED (10): Job start timestamp
    - CLUSTER (11): Cluster name
    - REGION (12): Cloud region
    - DETAILS (13): Job details
    - FAILOVER (14): Job failover history
    - ACTIONS (15): Available actions column
    """
    DROPDOWN = 0
    ID = 1
    TASK = 2
    NAME = 3
    RESOURCES = 4
    SUBMITTED = 5
    TOTAL_DURATION = 6
    JOB_DURATION = 7
    RECOVERIES = 8
    STATUS = 9
    STARTED = 10
    CLUSTER = 11
    REGION = 12
    DETAILS = 13
    FAILOVER = 14
    ACTIONS = 15


# Column headers matching the indices above. The order MUST agree with
# JobTableColumns: headers are rendered positionally against row cells that
# are indexed via the enum (previously 'Status'/'Recoveries' and
# 'Details'/'Failover' were swapped relative to the enum).
JOB_TABLE_COLUMNS = [
    '', 'ID', 'Task', 'Name', 'Resources', 'Submitted', 'Total Duration',
    'Job Duration', 'Recoveries', 'Status', 'Started', 'Cluster', 'Region',
    'Details', 'Failover', 'Actions'
]
93
+
94
+
95
+ def _extract_launch_history(log_content: str) -> str:
96
+ """Extract launch history from log content.
97
+
98
+ Args:
99
+ log_content: Content of the log file.
100
+ Returns:
101
+ A formatted string containing the launch history.
102
+ """
103
+ launches = []
104
+ current_launch = None
105
+
106
+ for line in log_content.splitlines():
107
+ if 'Launching on' in line:
108
+ try:
109
+ parts = line.split(']')
110
+ if len(parts) >= 2:
111
+ timestamp = parts[0].split()[1:3]
112
+ message = parts[1].replace('[0m⚙︎', '').strip()
113
+ formatted_line = f'{" ".join(timestamp)} {message}'
114
+ if current_launch:
115
+ prev_time, prev_target = current_launch.rsplit(
116
+ ' Launching on ', 1)
117
+ launches.append(
118
+ f'{prev_time} Tried to launch on {prev_target}')
119
+
120
+ # Store the current launch
121
+ current_launch = formatted_line
122
+ except IndexError:
123
+ launches.append(line.strip())
124
+
125
+ # Add the final (successful) launch at the beginning
126
+ if current_launch:
127
+ result = [current_launch]
128
+ result.extend(launches)
129
+ return '\n'.join(result)
130
+
131
+ return 'No launch history found'
132
+
133
+
44
134
  @app.route('/')
45
135
  def home():
46
136
  if not _is_running_on_jobs_controller():
@@ -54,38 +144,84 @@ def home():
54
144
  rows = managed_jobs.format_job_table(all_managed_jobs,
55
145
  show_all=True,
56
146
  return_rows=True)
57
- # Add an empty column for the dropdown button. This will be added in the
58
- # jobs/templates/index.html file.
59
- rows = [[''] + row for row in rows]
60
-
61
- # FIXME(zongheng): make the job table/queue funcs return structured info so
62
- # that we don't have to do things like row[-5] below.
63
- columns = [
64
- '', 'ID', 'Task', 'Name', 'Resources', 'Submitted', 'Total Duration',
65
- 'Job Duration', 'Recoveries', 'Status', 'Started', 'Cluster', 'Region',
66
- 'Failure'
67
- ]
68
- if rows and len(rows[0]) != len(columns):
147
+
148
+ status_counts = collections.defaultdict(int)
149
+ for task in all_managed_jobs:
150
+ if not task['status'].is_terminal():
151
+ status_counts[task['status'].value] += 1
152
+
153
+ # Add an empty column for the dropdown button and actions column
154
+ rows = [[''] + row + [''] + [''] for row in rows
155
+ ] # Add empty cell for failover and actions column
156
+
157
+ # Add log content as failover history for each job
158
+ for row in rows:
159
+ job_id = str(row[JobTableColumns.ID]).strip().replace(' ⤳', '')
160
+ if job_id and job_id != '-':
161
+ try:
162
+ log_path = os.path.join(
163
+ os.path.expanduser(
164
+ managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
165
+ f'{job_id}.log')
166
+ if os.path.exists(log_path):
167
+ with open(log_path, 'r', encoding='utf-8') as f:
168
+ log_content = f.read()
169
+ row[JobTableColumns.FAILOVER] = _extract_launch_history(
170
+ log_content)
171
+ else:
172
+ row[JobTableColumns.FAILOVER] = 'Log file not found'
173
+ except (IOError, OSError) as e:
174
+ row[JobTableColumns.FAILOVER] = f'Error reading log: {str(e)}'
175
+ app.logger.error('All managed jobs:')
176
+
177
+ # Validate column count
178
+ if rows and len(rows[0]) != len(JOB_TABLE_COLUMNS):
69
179
  raise RuntimeError(
70
- 'Dashboard code and managed job queue code are out of sync.')
180
+ f'Dashboard code and managed job queue code are out of sync. '
181
+ f'Expected {(JOB_TABLE_COLUMNS)} columns, got {(rows[0])}')
71
182
 
72
- # Fix STATUS color codes: '\x1b[33mCANCELLED\x1b[0m' -> 'CANCELLED'.
183
+ # Fix STATUS color codes: '\x1b[33mCANCELLED\x1b[0m' -> 'CANCELLED'
73
184
  for row in rows:
74
- row[-5] = common_utils.remove_color(row[-5])
75
- # Remove filler rows ([''], ..., ['-']).
76
- rows = [row for row in rows if ''.join(map(str, row)) != '']
185
+ row[JobTableColumns.STATUS] = common_utils.remove_color(
186
+ row[JobTableColumns.STATUS])
187
+
188
+ # Remove filler rows ([''], ..., ['-'])
189
+ rows = [
190
+ row for row in rows
191
+ if ''.join(map(str, row[:JobTableColumns.ACTIONS])) != ''
192
+ ]
193
+
194
+ # Get all unique status values
195
+ status_values = sorted(
196
+ list(set(row[JobTableColumns.STATUS] for row in rows)))
77
197
 
78
- # Get all unique status values.
79
- status_values = sorted(list(set(row[-5] for row in rows)))
80
198
  rendered_html = flask.render_template(
81
199
  'index.html',
82
- columns=columns,
200
+ columns=JOB_TABLE_COLUMNS,
83
201
  rows=rows,
84
202
  last_updated_timestamp=timestamp,
85
203
  status_values=status_values,
204
+ status_counts=status_counts,
86
205
  )
87
206
  return rendered_html
88
207
 
89
208
 
209
@app.route('/download_log/<job_id>')
def download_log(job_id):
    """Serve a managed job's controller log file as a download.

    Responds 404 when the log file does not exist, 500 on read errors.
    """
    try:
        logs_dir = os.path.expanduser(
            managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
        log_path = os.path.join(logs_dir, f'{job_id}.log')
        if not os.path.exists(log_path):
            # abort() raises an HTTPException, which is not caught by the
            # (IOError, OSError) handler below.
            flask.abort(404)
        return flask.send_file(log_path,
                               mimetype='text/plain',
                               as_attachment=True,
                               download_name=f'job_{job_id}.log')
    except (IOError, OSError) as exc:
        app.logger.error(f'Error downloading log for job {job_id}: {str(exc)}')
        flask.abort(500)
224
+
225
+
90
226
if __name__ == '__main__':
    # Start the Flask development server when this module is run as a script.
    app.run()