skypilot-nightly 1.0.0.dev20250114__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +50 -67
  3. sky/check.py +31 -1
  4. sky/cli.py +11 -34
  5. sky/clouds/kubernetes.py +3 -3
  6. sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
  7. sky/core.py +8 -5
  8. sky/data/storage.py +66 -14
  9. sky/global_user_state.py +1 -1
  10. sky/jobs/constants.py +8 -7
  11. sky/jobs/controller.py +19 -22
  12. sky/jobs/core.py +0 -2
  13. sky/jobs/recovery_strategy.py +114 -143
  14. sky/jobs/scheduler.py +283 -0
  15. sky/jobs/state.py +263 -21
  16. sky/jobs/utils.py +338 -96
  17. sky/provision/aws/config.py +48 -26
  18. sky/provision/gcp/instance_utils.py +15 -9
  19. sky/provision/kubernetes/instance.py +1 -1
  20. sky/provision/kubernetes/utils.py +76 -18
  21. sky/resources.py +1 -1
  22. sky/serve/autoscalers.py +359 -301
  23. sky/serve/controller.py +10 -8
  24. sky/serve/core.py +84 -7
  25. sky/serve/load_balancer.py +27 -10
  26. sky/serve/replica_managers.py +1 -3
  27. sky/serve/serve_state.py +10 -5
  28. sky/serve/serve_utils.py +28 -1
  29. sky/serve/service.py +4 -3
  30. sky/serve/service_spec.py +31 -0
  31. sky/skylet/constants.py +1 -1
  32. sky/skylet/events.py +7 -3
  33. sky/skylet/job_lib.py +10 -30
  34. sky/skylet/log_lib.py +8 -8
  35. sky/skylet/log_lib.pyi +3 -0
  36. sky/skylet/skylet.py +1 -1
  37. sky/templates/jobs-controller.yaml.j2 +7 -3
  38. sky/templates/sky-serve-controller.yaml.j2 +4 -0
  39. sky/utils/db_utils.py +18 -4
  40. sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
  41. sky/utils/resources_utils.py +25 -21
  42. sky/utils/schemas.py +13 -0
  43. sky/utils/subprocess_utils.py +48 -9
  44. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +4 -1
  45. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +49 -48
  46. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
  47. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +0 -0
  48. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
  49. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
@@ -553,17 +553,28 @@ def _configure_security_group(ec2, vpc_id: str, expected_sg_name: str,
553
553
 
554
554
  def _get_or_create_vpc_security_group(ec2, vpc_id: str,
555
555
  expected_sg_name: str) -> Any:
556
- # Figure out which security groups with this name exist for each VPC...
557
- vpc_to_existing_sg = {
558
- sg.vpc_id: sg for sg in _get_security_groups_from_vpc_ids(
559
- ec2,
560
- [vpc_id],
561
- [expected_sg_name],
562
- )
563
- }
556
+ """Find or create a security group in the specified VPC.
564
557
 
565
- if vpc_id in vpc_to_existing_sg:
566
- return vpc_to_existing_sg[vpc_id]
558
+ Args:
559
+ ec2: The initialized EC2 client object.
560
+ vpc_id: The ID of the VPC where the security group should be queried
561
+ or created.
562
+ expected_sg_name: The expected name of the security group.
563
+
564
+ Returns:
565
+ The security group object containing the details of the security group.
566
+
567
+ Raises:
568
+ exceptions.NoClusterLaunchedError: If the security group creation fails
569
+ and is not due to an existing duplicate.
570
+ botocore.exceptions.ClientError: If the security group creation fails
571
+ due to AWS service issues.
572
+ """
573
+ # Figure out which security groups with this name exist for each VPC...
574
+ security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
575
+ expected_sg_name)
576
+ if security_group is not None:
577
+ return security_group
567
578
 
568
579
  try:
569
580
  # create a new security group
@@ -573,34 +584,45 @@ def _get_or_create_vpc_security_group(ec2, vpc_id: str,
573
584
  VpcId=vpc_id,
574
585
  )
575
586
  except ec2.meta.client.exceptions.ClientError as e:
587
+ if e.response['Error']['Code'] == 'InvalidGroup.Duplicate':
588
+ # The security group already exists, but we didn't see it
589
+ # because of eventual consistency.
590
+ logger.warning(f'{expected_sg_name} already exists when creating.')
591
+ security_group = _get_security_group_from_vpc_id(
592
+ ec2, vpc_id, expected_sg_name)
593
+ assert (security_group is not None and
594
+ security_group.group_name == expected_sg_name), (
595
+ f'Expected {expected_sg_name} but got {security_group}')
596
+ logger.info(
597
+ f'Found existing security group {colorama.Style.BRIGHT}'
598
+ f'{security_group.group_name}{colorama.Style.RESET_ALL} '
599
+ f'[id={security_group.id}]')
600
+ return security_group
576
601
  message = ('Failed to create security group. Error: '
577
602
  f'{common_utils.format_exception(e)}')
578
603
  logger.warning(message)
579
604
  raise exceptions.NoClusterLaunchedError(message) from e
580
605
 
581
- security_group = _get_security_groups_from_vpc_ids(ec2, [vpc_id],
582
- [expected_sg_name])
583
-
584
- assert security_group, 'Failed to create security group'
585
- security_group = security_group[0]
586
-
606
+ security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
607
+ expected_sg_name)
608
+ assert security_group is not None, 'Failed to create security group'
587
609
  logger.info(f'Created new security group {colorama.Style.BRIGHT}'
588
610
  f'{security_group.group_name}{colorama.Style.RESET_ALL} '
589
611
  f'[id={security_group.id}]')
590
612
  return security_group
591
613
 
592
614
 
593
- def _get_security_groups_from_vpc_ids(ec2, vpc_ids: List[str],
594
- group_names: List[str]) -> List[Any]:
595
- unique_vpc_ids = list(set(vpc_ids))
596
- unique_group_names = set(group_names)
597
-
615
+ def _get_security_group_from_vpc_id(ec2, vpc_id: str,
616
+ group_name: str) -> Optional[Any]:
617
+ """Get security group by VPC ID and group name."""
598
618
  existing_groups = list(
599
619
  ec2.security_groups.filter(Filters=[{
600
620
  'Name': 'vpc-id',
601
- 'Values': unique_vpc_ids
621
+ 'Values': [vpc_id]
602
622
  }]))
603
- filtered_groups = [
604
- sg for sg in existing_groups if sg.group_name in unique_group_names
605
- ]
606
- return filtered_groups
623
+
624
+ for sg in existing_groups:
625
+ if sg.group_name == group_name:
626
+ return sg
627
+
628
+ return None
@@ -38,7 +38,7 @@ _FIREWALL_RESOURCE_NOT_FOUND_PATTERN = re.compile(
38
38
  r'The resource \'projects/.*/global/firewalls/.*\' was not found')
39
39
 
40
40
 
41
- def _retry_on_http_exception(
41
+ def _retry_on_gcp_http_exception(
42
42
  regex: Optional[str] = None,
43
43
  max_retries: int = GCP_MAX_RETRIES,
44
44
  retry_interval_s: int = GCP_RETRY_INTERVAL_SECONDS,
@@ -49,17 +49,18 @@ def _retry_on_http_exception(
49
49
 
50
50
  @functools.wraps(func)
51
51
  def wrapper(*args, **kwargs):
52
- exception_type = gcp.http_error_exception()
53
52
 
54
53
  def try_catch_exc():
55
54
  try:
56
55
  value = func(*args, **kwargs)
57
56
  return value
58
57
  except Exception as e: # pylint: disable=broad-except
59
- if not isinstance(e, exception_type) or (
60
- regex and not re.search(regex, str(e))):
61
- raise
62
- return e
58
+ if (isinstance(e, gcp.http_error_exception()) and
59
+ (regex is None or re.search(regex, str(e)))):
60
+ logger.error(
61
+ f'Retrying for gcp.http_error_exception: {e}')
62
+ return e
63
+ raise
63
64
 
64
65
  for _ in range(max_retries):
65
66
  ret = try_catch_exc()
@@ -431,7 +432,7 @@ class GCPComputeInstance(GCPInstance):
431
432
  logger.debug(
432
433
  f'Waiting GCP operation {operation["name"]} to be ready ...')
433
434
 
434
- @_retry_on_http_exception(
435
+ @_retry_on_gcp_http_exception(
435
436
  f'Failed to wait for operation {operation["name"]}')
436
437
  def call_operation(fn, timeout: int):
437
438
  request = fn(
@@ -613,6 +614,11 @@ class GCPComputeInstance(GCPInstance):
613
614
  return operation
614
615
 
615
616
  @classmethod
617
+ # When there is a cloud function running in parallel to set labels for
618
+ # newly created instances, it may fail with the following error:
619
+ # "Labels fingerprint either invalid or resource labels have changed"
620
+ # We should retry until the labels are set successfully.
621
+ @_retry_on_gcp_http_exception('Labels fingerprint either invalid')
616
622
  def set_labels(cls, project_id: str, availability_zone: str, node_id: str,
617
623
  labels: dict) -> None:
618
624
  node = cls.load_resource().instances().get(
@@ -1211,7 +1217,7 @@ class GCPTPUVMInstance(GCPInstance):
1211
1217
  """Poll for TPU operation until finished."""
1212
1218
  del project_id, region, zone # unused
1213
1219
 
1214
- @_retry_on_http_exception(
1220
+ @_retry_on_gcp_http_exception(
1215
1221
  f'Failed to wait for operation {operation["name"]}')
1216
1222
  def call_operation(fn, timeout: int):
1217
1223
  request = fn(name=operation['name'])
@@ -1379,7 +1385,7 @@ class GCPTPUVMInstance(GCPInstance):
1379
1385
  f'Failed to get VPC name for instance {instance}') from e
1380
1386
 
1381
1387
  @classmethod
1382
- @_retry_on_http_exception('unable to queue the operation')
1388
+ @_retry_on_gcp_http_exception('unable to queue the operation')
1383
1389
  def set_labels(cls, project_id: str, availability_zone: str, node_id: str,
1384
1390
  labels: dict) -> None:
1385
1391
  while True:
@@ -976,7 +976,7 @@ def terminate_instances(
976
976
  _terminate_node(namespace, context, pod_name)
977
977
 
978
978
  # Run pod termination in parallel
979
- subprocess_utils.run_in_parallel(_terminate_pod_thread, pods.items(),
979
+ subprocess_utils.run_in_parallel(_terminate_pod_thread, list(pods.items()),
980
980
  _NUM_THREADS)
981
981
 
982
982
 
@@ -7,6 +7,7 @@ import os
7
7
  import re
8
8
  import shutil
9
9
  import subprocess
10
+ import time
10
11
  import typing
11
12
  from typing import Any, Dict, List, Optional, Set, Tuple, Union
12
13
  from urllib.parse import urlparse
@@ -105,6 +106,75 @@ ANNOTATIONS_POD_NOT_FOUND_ERROR_MSG = ('Pod {pod_name} not found in namespace '
105
106
 
106
107
  logger = sky_logging.init_logger(__name__)
107
108
 
109
+ # Default retry settings for Kubernetes API calls
110
+ DEFAULT_MAX_RETRIES = 3
111
+ DEFAULT_RETRY_INTERVAL_SECONDS = 1
112
+
113
+
114
+ def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
115
+ retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
116
+ resource_type: Optional[str] = None):
117
+ """Decorator to retry Kubernetes API calls on transient failures.
118
+
119
+ Args:
120
+ max_retries: Maximum number of retry attempts
121
+ retry_interval: Initial seconds to wait between retries
122
+ resource_type: Type of resource being accessed (e.g. 'node', 'pod').
123
+ Used to provide more specific error messages.
124
+ """
125
+
126
+ def decorator(func):
127
+
128
+ @functools.wraps(func)
129
+ def wrapper(*args, **kwargs):
130
+ last_exception = None
131
+ backoff = common_utils.Backoff(initial_backoff=retry_interval,
132
+ max_backoff_factor=3)
133
+
134
+ for attempt in range(max_retries):
135
+ try:
136
+ return func(*args, **kwargs)
137
+ except (kubernetes.max_retry_error(),
138
+ kubernetes.api_exception(),
139
+ kubernetes.config_exception()) as e:
140
+ last_exception = e
141
+ # Don't retry on permanent errors like 401 (Unauthorized)
142
+ # or 403 (Forbidden)
143
+ if (isinstance(e, kubernetes.api_exception()) and
144
+ e.status in (401, 403)):
145
+ raise
146
+ if attempt < max_retries - 1:
147
+ sleep_time = backoff.current_backoff()
148
+ logger.debug(f'Kubernetes API call {func.__name__} '
149
+ f'failed with {str(e)}. Retrying in '
150
+ f'{sleep_time:.1f}s...')
151
+ time.sleep(sleep_time)
152
+ continue
153
+
154
+ # Format error message based on the type of exception
155
+ resource_msg = f' when trying to get {resource_type} info' \
156
+ if resource_type else ''
157
+ debug_cmd = f' To debug, run: kubectl get {resource_type}s' \
158
+ if resource_type else ''
159
+
160
+ if isinstance(last_exception, kubernetes.max_retry_error()):
161
+ error_msg = f'Timed out{resource_msg} from Kubernetes cluster.'
162
+ elif isinstance(last_exception, kubernetes.api_exception()):
163
+ error_msg = (f'Kubernetes API error{resource_msg}: '
164
+ f'{str(last_exception)}')
165
+ else:
166
+ error_msg = (f'Kubernetes configuration error{resource_msg}: '
167
+ f'{str(last_exception)}')
168
+
169
+ raise exceptions.ResourcesUnavailableError(
170
+ f'{error_msg}'
171
+ f' Please check if the cluster is healthy and retry.'
172
+ f'{debug_cmd}') from last_exception
173
+
174
+ return wrapper
175
+
176
+ return decorator
177
+
108
178
 
109
179
  class GPULabelFormatter:
110
180
  """Base class to define a GPU label formatter for a Kubernetes cluster
@@ -446,6 +516,7 @@ def detect_accelerator_resource(
446
516
 
447
517
 
448
518
  @functools.lru_cache(maxsize=10)
519
+ @_retry_on_error(resource_type='node')
449
520
  def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]:
450
521
  """Gets the kubernetes nodes in the context.
451
522
 
@@ -454,17 +525,12 @@ def get_kubernetes_nodes(context: Optional[str] = None) -> List[Any]:
454
525
  if context is None:
455
526
  context = get_current_kube_config_context_name()
456
527
 
457
- try:
458
- nodes = kubernetes.core_api(context).list_node(
459
- _request_timeout=kubernetes.API_TIMEOUT).items
460
- except kubernetes.max_retry_error():
461
- raise exceptions.ResourcesUnavailableError(
462
- 'Timed out when trying to get node info from Kubernetes cluster. '
463
- 'Please check if the cluster is healthy and retry. To debug, run: '
464
- 'kubectl get nodes') from None
528
+ nodes = kubernetes.core_api(context).list_node(
529
+ _request_timeout=kubernetes.API_TIMEOUT).items
465
530
  return nodes
466
531
 
467
532
 
533
+ @_retry_on_error(resource_type='pod')
468
534
  def get_all_pods_in_kubernetes_cluster(
469
535
  context: Optional[str] = None) -> List[Any]:
470
536
  """Gets pods in all namespaces in kubernetes cluster indicated by context.
@@ -474,14 +540,8 @@ def get_all_pods_in_kubernetes_cluster(
474
540
  if context is None:
475
541
  context = get_current_kube_config_context_name()
476
542
 
477
- try:
478
- pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
479
- _request_timeout=kubernetes.API_TIMEOUT).items
480
- except kubernetes.max_retry_error():
481
- raise exceptions.ResourcesUnavailableError(
482
- 'Timed out when trying to get pod info from Kubernetes cluster. '
483
- 'Please check if the cluster is healthy and retry. To debug, run: '
484
- 'kubectl get pods') from None
543
+ pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
544
+ _request_timeout=kubernetes.API_TIMEOUT).items
485
545
  return pods
486
546
 
487
547
 
@@ -1758,8 +1818,6 @@ def merge_dicts(source: Dict[Any, Any], destination: Dict[Any, Any]):
1758
1818
  else:
1759
1819
  destination[key].extend(value)
1760
1820
  else:
1761
- if destination is None:
1762
- destination = {}
1763
1821
  destination[key] = value
1764
1822
 
1765
1823
 
sky/resources.py CHANGED
@@ -540,7 +540,7 @@ class Resources:
540
540
  if memory_gb <= 0:
541
541
  with ux_utils.print_exception_no_traceback():
542
542
  raise ValueError(
543
- f'The "cpus" field should be positive. Found: {memory!r}')
543
+ f'The "memory" field should be positive. Found: {memory!r}')
544
544
 
545
545
  def _set_accelerators(
546
546
  self,