skypilot-nightly 1.0.0.dev20241227__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- sky/__init__.py +2 -2
- sky/adaptors/common.py +15 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/oci.py +32 -1
- sky/authentication.py +20 -8
- sky/backends/backend_utils.py +44 -0
- sky/backends/cloud_vm_ray_backend.py +202 -41
- sky/backends/wheel_utils.py +4 -1
- sky/check.py +31 -1
- sky/cli.py +39 -43
- sky/cloud_stores.py +71 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +137 -50
- sky/clouds/cloud.py +4 -0
- sky/clouds/do.py +303 -0
- sky/clouds/gcp.py +9 -0
- sky/clouds/kubernetes.py +3 -3
- sky/clouds/oci.py +20 -9
- sky/clouds/service_catalog/__init__.py +7 -3
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +10 -51
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/clouds/utils/oci_utils.py +15 -2
- sky/core.py +8 -5
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +19 -4
- sky/data/mounting_utils.py +99 -15
- sky/data/storage.py +961 -130
- sky/global_user_state.py +1 -1
- sky/jobs/__init__.py +2 -0
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +46 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +290 -21
- sky/jobs/utils.py +346 -95
- sky/optimizer.py +6 -3
- sky/provision/aws/config.py +59 -29
- sky/provision/azure/instance.py +1 -1
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +306 -0
- sky/provision/docker_utils.py +22 -11
- sky/provision/gcp/instance_utils.py +15 -9
- sky/provision/kubernetes/instance.py +3 -2
- sky/provision/kubernetes/utils.py +125 -20
- sky/provision/oci/query_utils.py +17 -14
- sky/provision/provisioner.py +0 -1
- sky/provision/runpod/instance.py +10 -1
- sky/provision/runpod/utils.py +170 -13
- sky/resources.py +1 -1
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +84 -7
- sky/serve/load_balancer.py +27 -10
- sky/serve/replica_managers.py +1 -3
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/setup_files/dependencies.py +4 -1
- sky/skylet/constants.py +8 -4
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/providers/command_runner.py +5 -7
- sky/skylet/skylet.py +1 -1
- sky/task.py +28 -1
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/jobs-controller.yaml.j2 +41 -7
- sky/templates/runpod-ray.yml.j2 +13 -0
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/usage/usage_lib.py +10 -2
- sky/utils/accelerator_registry.py +12 -8
- sky/utils/controller_utils.py +114 -39
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/log_utils.py +2 -0
- sky/utils/resources_utils.py +25 -21
- sky/utils/schemas.py +27 -0
- sky/utils/subprocess_utils.py +54 -10
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +23 -4
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +92 -82
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241227.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/provision/aws/config.py
CHANGED
@@ -383,10 +383,13 @@ def _usable_subnets(
             raise exc
 
     if not subnets:
+        vpc_msg = (f'Does a default VPC exist in region '
+                   f'{ec2.meta.client.meta.region_name}? ') if (
+                       vpc_id_of_sg is None) else ''
         _skypilot_log_error_and_exit_for_failover(
-            'No usable subnets found, try '
-            'manually creating an instance in your specified region to '
-            'populate the list of subnets and try again. '
+            f'No usable subnets found. {vpc_msg}'
+            'Try manually creating an instance in your specified region to '
+            'populate the list of subnets and try again. '
             'Note that the subnet must map public IPs '
             'on instance launch unless you set `use_internal_ips: true` in '
             'the `provider` config.')
@@ -495,6 +498,11 @@ def _get_subnet_and_vpc_id(ec2, security_group_ids: Optional[List[str]],
         vpc_id_of_sg = None
 
     all_subnets = list(ec2.subnets.all())
+    # If no VPC is specified, use the default VPC.
+    # We filter only for default VPCs to avoid using subnets that users may
+    # not want SkyPilot to use.
+    if vpc_id_of_sg is None:
+        all_subnets = [s for s in all_subnets if s.vpc.is_default]
     subnets, vpc_id = _usable_subnets(
         ec2,
         user_specified_subnets=None,
@@ -545,17 +553,28 @@ def _configure_security_group(ec2, vpc_id: str, expected_sg_name: str,
 
 def _get_or_create_vpc_security_group(ec2, vpc_id: str,
                                       expected_sg_name: str) -> Any:
-
-    vpc_to_existing_sg = {
-        sg.vpc_id: sg for sg in _get_security_groups_from_vpc_ids(
-            ec2,
-            [vpc_id],
-            [expected_sg_name],
-        )
-    }
+    """Find or create a security group in the specified VPC.
 
-
-
+    Args:
+        ec2: The initialized EC2 client object.
+        vpc_id: The ID of the VPC where the security group should be queried
+            or created.
+        expected_sg_name: The expected name of the security group.
+
+    Returns:
+        The security group object containing the details of the security group.
+
+    Raises:
+        exceptions.NoClusterLaunchedError: If the security group creation fails
+            and is not due to an existing duplicate.
+        botocore.exceptions.ClientError: If the security group creation fails
+            due to AWS service issues.
+    """
+    # Figure out which security groups with this name exist for each VPC...
+    security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
+                                                     expected_sg_name)
+    if security_group is not None:
+        return security_group
 
     try:
         # create a new security group
@@ -565,34 +584,45 @@ def _get_or_create_vpc_security_group(ec2, vpc_id: str,
             VpcId=vpc_id,
         )
     except ec2.meta.client.exceptions.ClientError as e:
+        if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
+            # The security group already exists, but we didn't see it
+            # because of eventual consistency.
+            logger.warning(f'{expected_sg_name} already exists when creating.')
+            security_group = _get_security_group_from_vpc_id(
+                ec2, vpc_id, expected_sg_name)
+            assert (security_group is not None and
+                    security_group.group_name == expected_sg_name), (
+                        f'Expected {expected_sg_name} but got {security_group}')
+            logger.info(
+                f'Found existing security group {colorama.Style.BRIGHT}'
+                f'{security_group.group_name}{colorama.Style.RESET_ALL} '
+                f'[id={security_group.id}]')
+            return security_group
         message = ('Failed to create security group. Error: '
                    f'{common_utils.format_exception(e)}')
         logger.warning(message)
         raise exceptions.NoClusterLaunchedError(message) from e
 
-    security_group =
-
-
-    assert security_group, 'Failed to create security group'
-    security_group = security_group[0]
-
+    security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
+                                                     expected_sg_name)
+    assert security_group is not None, 'Failed to create security group'
     logger.info(f'Created new security group {colorama.Style.BRIGHT}'
                 f'{security_group.group_name}{colorama.Style.RESET_ALL} '
                 f'[id={security_group.id}]')
     return security_group
 
 
-def
-
-
-    unique_group_names = set(group_names)
-
+def _get_security_group_from_vpc_id(ec2, vpc_id: str,
+                                    group_name: str) -> Optional[Any]:
+    """Get security group by VPC ID and group name."""
     existing_groups = list(
         ec2.security_groups.filter(Filters=[{
             'Name': 'vpc-id',
-            'Values':
+            'Values': [vpc_id]
         }]))
-
-
-
-
+
+    for sg in existing_groups:
+        if sg.group_name == group_name:
+            return sg
+
+    return None
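Taken together, the config.py changes replace the old multi-VPC lookup with a single-VPC helper and make creation tolerant of the InvalidGroup.Duplicate race. Below is a minimal, self-contained sketch of that get-or-create pattern against plain boto3; the function names, description string, and placeholder VPC ID are illustrative, not SkyPilot's API.

from typing import Any, Optional

import boto3
from botocore.exceptions import ClientError


def find_security_group(ec2, vpc_id: str, group_name: str) -> Optional[Any]:
    # Same idea as _get_security_group_from_vpc_id(): filter by VPC,
    # then match on the group name.
    for sg in ec2.security_groups.filter(Filters=[{
            'Name': 'vpc-id',
            'Values': [vpc_id],
    }]):
        if sg.group_name == group_name:
            return sg
    return None


def get_or_create_security_group(ec2, vpc_id: str, group_name: str) -> Any:
    sg = find_security_group(ec2, vpc_id, group_name)
    if sg is not None:
        return sg
    try:
        return ec2.create_security_group(GroupName=group_name,
                                         Description='example group',
                                         VpcId=vpc_id)
    except ClientError as e:
        # Another writer may have created the group after our lookup
        # (eventual consistency); re-read it instead of failing.
        if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
            return find_security_group(ec2, vpc_id, group_name)
        raise


if __name__ == '__main__':
    ec2 = boto3.resource('ec2')
    # 'vpc-0123456789abcdef0' is a placeholder VPC ID for illustration.
    print(get_or_create_security_group(ec2, 'vpc-0123456789abcdef0',
                                       'sky-example-sg'))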
sky/provision/azure/instance.py
CHANGED
@@ -343,7 +343,7 @@ def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
         _create_vm(compute_client, vm_name, node_tags, provider_config,
                    node_config, network_interface.id)
 
-    subprocess_utils.run_in_parallel(create_single_instance, range(count))
+    subprocess_utils.run_in_parallel(create_single_instance, list(range(count)))
 
     # Update disk performance tier
     performance_tier = node_config.get('disk_performance_tier', None)
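The one-line Azure change materializes the range into a list before handing it to the parallel runner, presumably so the helper receives the concrete List its signature expects. The sketch below is not SkyPilot's subprocess_utils.run_in_parallel; it is a generic, hypothetical stand-in for a helper that takes an explicit list of arguments.

import concurrent.futures
from typing import Callable, List, TypeVar

T = TypeVar('T')


def run_in_parallel(fn: Callable[[T], None], args: List[T]) -> None:
    # A hypothetical runner: it needs len(args) up front to size the pool
    # and then submits each argument, so a concrete list is a natural fit.
    workers = max(1, len(args))
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        list(pool.map(fn, args))


run_in_parallel(print, list(range(3)))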
sky/provision/do/__init__.py
ADDED
@@ -0,0 +1,11 @@
+"""DO provisioner for SkyPilot."""
+
+from sky.provision.do.config import bootstrap_instances
+from sky.provision.do.instance import cleanup_ports
+from sky.provision.do.instance import get_cluster_info
+from sky.provision.do.instance import open_ports
+from sky.provision.do.instance import query_instances
+from sky.provision.do.instance import run_instances
+from sky.provision.do.instance import stop_instances
+from sky.provision.do.instance import terminate_instances
+from sky.provision.do.instance import wait_instances
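The new package re-exports the instance-level entry points at sky.provision.do so that, presumably, the provisioner framework can resolve them by provider name alone. A hypothetical sketch of that kind of name-based dispatch (not SkyPilot's actual router):

import importlib
from typing import Any, Callable


def resolve_provider_fn(provider: str, fn_name: str) -> Callable[..., Any]:
    # e.g. resolve_provider_fn('do', 'query_instances') would return the
    # query_instances re-exported by sky/provision/do/__init__.py above.
    module = importlib.import_module(f'sky.provision.{provider}')
    return getattr(module, fn_name)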
sky/provision/do/config.py
ADDED
@@ -0,0 +1,14 @@
+"""Paperspace configuration bootstrapping."""
+
+from sky import sky_logging
+from sky.provision import common
+
+logger = sky_logging.init_logger(__name__)
+
+
+def bootstrap_instances(
+        region: str, cluster_name: str,
+        config: common.ProvisionConfig) -> common.ProvisionConfig:
+    """Bootstraps instances for the given cluster."""
+    del region, cluster_name
+    return config
sky/provision/do/instance.py
ADDED
@@ -0,0 +1,287 @@
+"""DigitalOcean instance provisioning."""
+
+import time
+from typing import Any, Dict, List, Optional
+import uuid
+
+from sky import sky_logging
+from sky import status_lib
+from sky.provision import common
+from sky.provision.do import constants
+from sky.provision.do import utils
+
+# The maximum number of times to poll for the status of an operation
+MAX_POLLS = 60 // constants.POLL_INTERVAL
+# Stopping instances can take several minutes, so we increase the timeout
+MAX_POLLS_FOR_UP_OR_STOP = MAX_POLLS * 8
+
+logger = sky_logging.init_logger(__name__)
+
+
+def _get_head_instance(
+        instances: Dict[str, Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+    for instance_name, instance_meta in instances.items():
+        if instance_name.endswith('-head'):
+            return instance_meta
+    return None
+
+
+def run_instances(region: str, cluster_name_on_cloud: str,
+                  config: common.ProvisionConfig) -> common.ProvisionRecord:
+    """Runs instances for the given cluster."""
+
+    pending_status = ['new']
+    newly_started_instances = utils.filter_instances(cluster_name_on_cloud,
+                                                     pending_status + ['off'])
+    while True:
+        instances = utils.filter_instances(cluster_name_on_cloud,
+                                           pending_status)
+        if not instances:
+            break
+        instance_statuses = [
+            instance['status'] for instance in instances.values()
+        ]
+        logger.info(f'Waiting for {len(instances)} instances to be ready: '
+                    f'{instance_statuses}')
+        time.sleep(constants.POLL_INTERVAL)
+
+    exist_instances = utils.filter_instances(cluster_name_on_cloud,
+                                             status_filters=pending_status +
+                                             ['active', 'off'])
+    if len(exist_instances) > config.count:
+        raise RuntimeError(
+            f'Cluster {cluster_name_on_cloud} already has '
+            f'{len(exist_instances)} nodes, but {config.count} are required.')
+
+    stopped_instances = utils.filter_instances(cluster_name_on_cloud,
+                                               status_filters=['off'])
+    for instance in stopped_instances.values():
+        utils.start_instance(instance)
+    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
+        instances = utils.filter_instances(cluster_name_on_cloud, ['off'])
+        if len(instances) == 0:
+            break
+        num_stopped_instances = len(stopped_instances)
+        num_restarted_instances = num_stopped_instances - len(instances)
+        logger.info(
+            f'Waiting for {num_restarted_instances}/{num_stopped_instances} '
+            'stopped instances to be restarted.')
+        time.sleep(constants.POLL_INTERVAL)
+    else:
+        msg = ('run_instances: Failed to restart all'
+               'instances possibly due to to capacity issue.')
+        logger.warning(msg)
+        raise RuntimeError(msg)
+
+    exist_instances = utils.filter_instances(cluster_name_on_cloud,
+                                             status_filters=['active'])
+    head_instance = _get_head_instance(exist_instances)
+    to_start_count = config.count - len(exist_instances)
+    if to_start_count < 0:
+        raise RuntimeError(
+            f'Cluster {cluster_name_on_cloud} already has '
+            f'{len(exist_instances)} nodes, but {config.count} are required.')
+    if to_start_count == 0:
+        if head_instance is None:
+            head_instance = list(exist_instances.values())[0]
+            utils.rename_instance(
+                head_instance,
+                f'{cluster_name_on_cloud}-{uuid.uuid4().hex[:4]}-head')
+        assert head_instance is not None, ('`head_instance` should not be None')
+        logger.info(f'Cluster {cluster_name_on_cloud} already has '
+                    f'{len(exist_instances)} nodes, no need to start more.')
+        return common.ProvisionRecord(
+            provider_name='do',
+            cluster_name=cluster_name_on_cloud,
+            region=region,
+            zone=None,
+            head_instance_id=head_instance['name'],
+            resumed_instance_ids=list(newly_started_instances.keys()),
+            created_instance_ids=[],
+        )
+
+    created_instances: List[Dict[str, Any]] = []
+    for _ in range(to_start_count):
+        instance_type = 'head' if head_instance is None else 'worker'
+        instance = utils.create_instance(
+            region=region,
+            cluster_name_on_cloud=cluster_name_on_cloud,
+            instance_type=instance_type,
+            config=config)
+        logger.info(f'Launched instance {instance["name"]}.')
+        created_instances.append(instance)
+        if head_instance is None:
+            head_instance = instance
+
+    # Wait for instances to be ready.
+    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
+        instances = utils.filter_instances(cluster_name_on_cloud,
+                                           status_filters=['active'])
+        logger.info('Waiting for instances to be ready: '
+                    f'({len(instances)}/{config.count}).')
+        if len(instances) == config.count:
+            break
+
+        time.sleep(constants.POLL_INTERVAL)
+    else:
+        # Failed to launch config.count of instances after max retries
+        msg = 'run_instances: Failed to create the instances'
+        logger.warning(msg)
+        raise RuntimeError(msg)
+    assert head_instance is not None, 'head_instance should not be None'
+    return common.ProvisionRecord(
+        provider_name='do',
+        cluster_name=cluster_name_on_cloud,
+        region=region,
+        zone=None,
+        head_instance_id=head_instance['name'],
+        resumed_instance_ids=list(stopped_instances.keys()),
+        created_instance_ids=[
+            instance['name'] for instance in created_instances
+        ],
+    )
+
+
+def wait_instances(region: str, cluster_name_on_cloud: str,
+                   state: Optional[status_lib.ClusterStatus]) -> None:
+    del region, cluster_name_on_cloud, state  # unused
+    # We already wait on ready state in `run_instances` no need
+
+
+def stop_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    del provider_config  # unused
+    all_instances = utils.filter_instances(cluster_name_on_cloud,
+                                           status_filters=None)
+    num_instances = len(all_instances)
+
+    # Request a stop on all instances
+    for instance_name, instance_meta in all_instances.items():
+        if worker_only and instance_name.endswith('-head'):
+            num_instances -= 1
+            continue
+        utils.stop_instance(instance_meta)
+
+    # Wait for instances to stop
+    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
+        all_instances = utils.filter_instances(cluster_name_on_cloud, ['off'])
+        if len(all_instances) >= num_instances:
+            break
+        time.sleep(constants.POLL_INTERVAL)
+    else:
+        raise RuntimeError(f'Maximum number of polls: '
+                           f'{MAX_POLLS_FOR_UP_OR_STOP} reached. '
+                           f'Instance {all_instances} is still not in '
+                           'STOPPED status.')
+
+
+def terminate_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    worker_only: bool = False,
+) -> None:
+    """See sky/provision/__init__.py"""
+    del provider_config  # unused
+    instances = utils.filter_instances(cluster_name_on_cloud,
+                                       status_filters=None)
+    for instance_name, instance_meta in instances.items():
+        logger.debug(f'Terminating instance {instance_name}')
+        if worker_only and instance_name.endswith('-head'):
+            continue
+        utils.down_instance(instance_meta)
+
+    for _ in range(MAX_POLLS_FOR_UP_OR_STOP):
+        instances = utils.filter_instances(cluster_name_on_cloud,
+                                           status_filters=None)
+        if len(instances) == 0 or len(instances) <= 1 and worker_only:
+            break
+        time.sleep(constants.POLL_INTERVAL)
+    else:
+        msg = ('Failed to delete all instances')
+        logger.warning(msg)
+        raise RuntimeError(msg)
+
+
+def get_cluster_info(
+    region: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> common.ClusterInfo:
+    del region  # unused
+    running_instances = utils.filter_instances(cluster_name_on_cloud,
+                                               ['active'])
+    instances: Dict[str, List[common.InstanceInfo]] = {}
+    head_instance: Optional[str] = None
+    for instance_name, instance_meta in running_instances.items():
+        if instance_name.endswith('-head'):
+            head_instance = instance_name
+        for net in instance_meta['networks']['v4']:
+            if net['type'] == 'public':
+                instance_ip = net['ip_address']
+                break
+        instances[instance_name] = [
+            common.InstanceInfo(
+                instance_id=instance_meta['name'],
+                internal_ip=instance_ip,
+                external_ip=instance_ip,
+                ssh_port=22,
+                tags={},
+            )
+        ]
+
+    assert head_instance is not None, 'no head instance found'
+    return common.ClusterInfo(
+        instances=instances,
+        head_instance_id=head_instance,
+        provider_name='do',
+        provider_config=provider_config,
+    )
+
+
+def query_instances(
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+) -> Dict[str, Optional[status_lib.ClusterStatus]]:
+    """See sky/provision/__init__.py"""
+    # terminated instances are not retrieved by the
+    # API making `non_terminated_only` argument moot.
+    del non_terminated_only
+    assert provider_config is not None, (cluster_name_on_cloud, provider_config)
+    instances = utils.filter_instances(cluster_name_on_cloud,
+                                       status_filters=None)
+
+    status_map = {
+        'new': status_lib.ClusterStatus.INIT,
+        'archive': status_lib.ClusterStatus.INIT,
+        'active': status_lib.ClusterStatus.UP,
+        'off': status_lib.ClusterStatus.STOPPED,
+    }
+    statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
+    for instance_meta in instances.values():
+        status = status_map[instance_meta['status']]
+        statuses[instance_meta['name']] = status
+    return statuses
+
+
+def open_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    """See sky/provision/__init__.py"""
+    logger.debug(
+        f'Skip opening ports {ports} for DigitalOcean instances, as all '
+        'ports are open by default.')
+    del cluster_name_on_cloud, provider_config, ports
+
+
+def cleanup_ports(
+    cluster_name_on_cloud: str,
+    ports: List[str],
+    provider_config: Optional[Dict[str, Any]] = None,
+) -> None:
+    del cluster_name_on_cloud, provider_config, ports
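run_instances, stop_instances, and terminate_instances above all share the same shape: poll inside a bounded for loop and rely on Python's for/else so the else branch fires only when the retry budget is exhausted without a break. A self-contained sketch of that pattern (the interval, budget, and predicate here are made up for illustration):

import time
from typing import Callable

POLL_INTERVAL = 1  # seconds; illustrative only
MAX_POLLS = 10


def wait_until(done: Callable[[], bool], max_polls: int = MAX_POLLS) -> None:
    for _ in range(max_polls):
        if done():
            break
        time.sleep(POLL_INTERVAL)
    else:
        # Runs only if the loop finished without ever hitting `break`,
        # i.e. the condition never became true within the poll budget.
        raise RuntimeError(f'Condition not met after {max_polls} polls.')


if __name__ == '__main__':
    start = time.time()
    wait_until(lambda: time.time() - start > 2)
    print('done')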