skypilot-nightly 1.0.0.dev20250818__py3-none-any.whl → 1.0.0.dev20250820__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (73)
  1. sky/__init__.py +5 -3
  2. sky/backends/cloud_vm_ray_backend.py +6 -13
  3. sky/backends/wheel_utils.py +2 -1
  4. sky/catalog/data_fetchers/fetch_aws.py +2 -0
  5. sky/client/cli/command.py +20 -16
  6. sky/core.py +1 -1
  7. sky/dashboard/out/404.html +1 -1
  8. sky/dashboard/out/_next/static/{D7_ocVBIBwyxtvXYWggqV → 8ZscIHnvBWz3AXkxsJL6H}/_buildManifest.js +1 -1
  9. sky/dashboard/out/_next/static/chunks/3015-bf218e4973bf5c8f.js +1 -0
  10. sky/dashboard/out/_next/static/chunks/{8969-6cb1af4ec7fb1e19.js → 8969-23c8fbdb8b397d59.js} +1 -1
  11. sky/dashboard/out/_next/static/chunks/{webpack-a46c8b62df807ec1.js → webpack-008593a02784a2df.js} +1 -1
  12. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  13. sky/dashboard/out/clusters/[cluster].html +1 -1
  14. sky/dashboard/out/clusters.html +1 -1
  15. sky/dashboard/out/config.html +1 -1
  16. sky/dashboard/out/index.html +1 -1
  17. sky/dashboard/out/infra/[context].html +1 -1
  18. sky/dashboard/out/infra.html +1 -1
  19. sky/dashboard/out/jobs/[job].html +1 -1
  20. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  21. sky/dashboard/out/jobs.html +1 -1
  22. sky/dashboard/out/users.html +1 -1
  23. sky/dashboard/out/volumes.html +1 -1
  24. sky/dashboard/out/workspace/new.html +1 -1
  25. sky/dashboard/out/workspaces/[name].html +1 -1
  26. sky/dashboard/out/workspaces.html +1 -1
  27. sky/exceptions.py +6 -1
  28. sky/global_user_state.py +18 -11
  29. sky/jobs/constants.py +1 -1
  30. sky/jobs/server/core.py +43 -34
  31. sky/jobs/server/utils.py +2 -1
  32. sky/jobs/utils.py +56 -9
  33. sky/models.py +1 -0
  34. sky/provision/aws/config.py +11 -11
  35. sky/provision/aws/instance.py +30 -27
  36. sky/provision/do/utils.py +2 -2
  37. sky/provision/kubernetes/network_utils.py +3 -3
  38. sky/provision/kubernetes/utils.py +2 -2
  39. sky/provision/kubernetes/volume.py +2 -0
  40. sky/provision/provisioner.py +10 -6
  41. sky/serve/replica_managers.py +7 -0
  42. sky/serve/server/impl.py +1 -1
  43. sky/server/requests/payloads.py +2 -0
  44. sky/server/requests/serializers/encoders.py +29 -5
  45. sky/server/server.py +37 -1
  46. sky/setup_files/MANIFEST.in +1 -0
  47. sky/setup_files/dependencies.py +17 -11
  48. sky/skylet/ray_patches/__init__.py +18 -4
  49. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  50. sky/skylet/ray_patches/cli.py.diff +19 -0
  51. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  52. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  53. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  54. sky/skylet/ray_patches/updater.py.diff +18 -0
  55. sky/skylet/ray_patches/worker.py.diff +41 -0
  56. sky/utils/common.py +27 -7
  57. sky/utils/common_utils.py +13 -9
  58. sky/utils/directory_utils.py +12 -0
  59. sky/utils/env_options.py +3 -0
  60. sky/utils/kubernetes/gpu_labeler.py +3 -3
  61. sky/utils/schemas.py +1 -0
  62. sky/utils/serialize_utils.py +16 -0
  63. sky/volumes/client/sdk.py +10 -7
  64. sky/volumes/server/core.py +12 -3
  65. sky/volumes/volume.py +17 -3
  66. {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/METADATA +21 -13
  67. {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/RECORD +72 -63
  68. sky/dashboard/out/_next/static/chunks/3015-471d67c9302d4027.js +0 -1
  69. /sky/dashboard/out/_next/static/{D7_ocVBIBwyxtvXYWggqV → 8ZscIHnvBWz3AXkxsJL6H}/_ssgManifest.js +0 -0
  70. {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/WHEEL +0 -0
  71. {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/entry_points.txt +0 -0
  72. {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/licenses/LICENSE +0 -0
  73. {skypilot_nightly-1.0.0.dev20250818.dist-info → skypilot_nightly-1.0.0.dev20250820.dist-info}/top_level.txt +0 -0
sky/provision/aws/config.py CHANGED
@@ -498,8 +498,8 @@ def _vpc_id_from_security_group_ids(ec2: 'mypy_boto3_ec2.ServiceResource',
     return vpc_ids[0]


-def _get_vpc_id_by_name(ec2: 'mypy_boto3_ec2.ServiceResource', vpc_name: str,
-                        region: str) -> str:
+def get_vpc_id_by_name(ec2: 'mypy_boto3_ec2.ServiceResource', vpc_name: str,
+                       region: str) -> str:
     """Returns the VPC ID of the unique VPC with a given name.

     Exits with code 1 if:
@@ -532,7 +532,7 @@ def _get_subnet_and_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
                            use_internal_ips: bool,
                            vpc_name: Optional[str]) -> Tuple[Any, str]:
     if vpc_name is not None:
-        vpc_id_of_sg = _get_vpc_id_by_name(ec2, vpc_name, region)
+        vpc_id_of_sg = get_vpc_id_by_name(ec2, vpc_name, region)
     elif security_group_ids:
         vpc_id_of_sg = _vpc_id_from_security_group_ids(ec2, security_group_ids)
     else:
@@ -614,8 +614,8 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
         due to AWS service issues.
     """
     # Figure out which security groups with this name exist for each VPC...
-    security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
-                                                     expected_sg_name)
+    security_group = get_security_group_from_vpc_id(ec2, vpc_id,
+                                                    expected_sg_name)
     if security_group is not None:
         return security_group

@@ -631,7 +631,7 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
             # The security group already exists, but we didn't see it
             # because of eventual consistency.
             logger.warning(f'{expected_sg_name} already exists when creating.')
-            security_group = _get_security_group_from_vpc_id(
+            security_group = get_security_group_from_vpc_id(
                 ec2, vpc_id, expected_sg_name)
             assert (security_group is not None and
                     security_group.group_name == expected_sg_name), (
@@ -646,8 +646,8 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
             logger.warning(message)
             raise exceptions.NoClusterLaunchedError(message) from e

-    security_group = _get_security_group_from_vpc_id(ec2, vpc_id,
-                                                     expected_sg_name)
+    security_group = get_security_group_from_vpc_id(ec2, vpc_id,
+                                                    expected_sg_name)
     assert security_group is not None, 'Failed to create security group'
     logger.info(f'Created new security group {colorama.Style.BRIGHT}'
                 f'{security_group.group_name}{colorama.Style.RESET_ALL} '
@@ -655,9 +655,9 @@ def _get_or_create_vpc_security_group(ec2: 'mypy_boto3_ec2.ServiceResource',
     return security_group


-def _get_security_group_from_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
-                                    vpc_id: str,
-                                    group_name: str) -> Optional[Any]:
+def get_security_group_from_vpc_id(ec2: 'mypy_boto3_ec2.ServiceResource',
+                                   vpc_id: str,
+                                   group_name: str) -> Optional[Any]:
     """Get security group by VPC ID and group name."""
     existing_groups = list(
         ec2.security_groups.filter(Filters=[{
sky/provision/aws/instance.py CHANGED
@@ -18,6 +18,7 @@ from sky.clouds import aws as aws_cloud
 from sky.clouds.utils import aws_utils
 from sky.provision import common
 from sky.provision import constants
+from sky.provision.aws import config as aws_config
 from sky.provision.aws import utils
 from sky.utils import common_utils
 from sky.utils import resources_utils
@@ -685,7 +686,9 @@ def terminate_instances(
         filters,
         included_instances=None,
         excluded_instances=None)
-    default_sg = _get_sg_from_name(ec2, aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
+    default_sg = aws_config.get_security_group_from_vpc_id(
+        ec2, _get_vpc_id(provider_config),
+        aws_cloud.DEFAULT_SECURITY_GROUP_NAME)
     if sg_name == aws_cloud.DEFAULT_SECURITY_GROUP_NAME:
         # Case 1: The default SG is used, we don't need to ensure instance are
         # terminated.
@@ -727,30 +730,6 @@
     # of most cloud implementations (including AWS).


-def _get_sg_from_name(
-    ec2: Any,
-    sg_name: str,
-) -> Any:
-    # GroupNames will only filter SGs in the default VPC, so we need to use
-    # Filters here. Ref:
-    # https://boto3.amazonaws.com/v1/documentation/api/1.26.112/reference/services/ec2/service-resource/security_groups.html # pylint: disable=line-too-long
-    sgs = ec2.security_groups.filter(Filters=[{
-        'Name': 'group-name',
-        'Values': [sg_name]
-    }])
-    num_sg = len(list(sgs))
-    if num_sg == 0:
-        logger.warning(f'Expected security group {sg_name} not found. ')
-        return None
-    if num_sg > 1:
-        # TODO(tian): Better handle this case. Maybe we can check when creating
-        # the SG and throw an error if there is already an existing SG with the
-        # same name.
-        logger.warning(f'Found {num_sg} security groups with name {sg_name}. ')
-        return None
-    return list(sgs)[0]
-
-
 def _maybe_move_to_new_sg(
     instance: Any,
     expected_sg: Any,
@@ -803,7 +782,9 @@ def open_ports(
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Instance with cluster name '
                              f'{cluster_name_on_cloud} not found.')
-    sg = _get_sg_from_name(ec2, sg_name)
+    sg = aws_config.get_security_group_from_vpc_id(ec2,
+                                                   _get_vpc_id(provider_config),
+                                                   sg_name)
     if sg is None:
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Cannot find new security group '
@@ -899,7 +880,9 @@ def cleanup_ports(
         # We only want to delete the SG that is dedicated to this cluster (i.e.,
         # this cluster have opened some ports).
         return
-    sg = _get_sg_from_name(ec2, sg_name)
+    sg = aws_config.get_security_group_from_vpc_id(ec2,
+                                                   _get_vpc_id(provider_config),
+                                                   sg_name)
     if sg is None:
         logger.warning(
             'Find security group failed. Skip cleanup security group.')
@@ -1010,3 +993,23 @@ def get_cluster_info(
         provider_name='aws',
         provider_config=provider_config,
     )
+
+
+def _get_vpc_id(provider_config: Dict[str, Any]) -> str:
+    region = provider_config['region']
+    ec2 = _default_ec2_resource(provider_config['region'])
+    if 'vpc_name' in provider_config:
+        return aws_config.get_vpc_id_by_name(ec2, provider_config['vpc_name'],
+                                             region)
+    else:
+        # Retrieve the default VPC name from the region.
+        response = ec2.meta.client.describe_vpcs(Filters=[{
+            'Name': 'isDefault',
+            'Values': ['true']
+        }])
+        if len(response['Vpcs']) == 0:
+            raise ValueError(f'No default VPC found in region {region}')
+        elif len(response['Vpcs']) > 1:
+            raise ValueError(f'Multiple default VPCs found in region {region}')
+        else:
+            return response['Vpcs'][0]['VpcId']
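Net effect of the two AWS files above: the removed name-only lookup (_get_sg_from_name) gave up whenever several VPCs in a region held a security group with the same name, so the config.py helpers were made public and every lookup is now scoped to one VPC. A minimal standalone sketch of such a scoped query, assuming boto3 and illustrative arguments (this is not the packaged code):

import boto3


def find_sg(region: str, vpc_id: str, group_name: str):
    """Return the security group named `group_name` in `vpc_id`, or None."""
    ec2 = boto3.resource('ec2', region_name=region)
    # Filtering on 'group-name' alone can match one SG per VPC; adding
    # 'vpc-id' pins the result to a single VPC.
    groups = list(
        ec2.security_groups.filter(Filters=[
            {'Name': 'vpc-id', 'Values': [vpc_id]},
            {'Name': 'group-name', 'Values': [group_name]},
        ]))
    return groups[0] if groups else None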
sky/provision/do/utils.py CHANGED
@@ -30,7 +30,7 @@ POSSIBLE_CREDENTIALS_PATHS = [
 INITIAL_BACKOFF_SECONDS = 10
 MAX_BACKOFF_FACTOR = 10
 MAX_ATTEMPTS = 6
-SSH_KEY_NAME_ON_DO = f'sky-key-{common_utils.get_user_hash()}'
+SSH_KEY_NAME_ON_DO_PREFIX = 'sky-key-'

 _client = None
 _ssh_key_id = None
@@ -125,7 +125,7 @@ def ssh_key_id(public_key: str):

        request = {
            'public_key': public_key,
-            'name': SSH_KEY_NAME_ON_DO,
+            'name': SSH_KEY_NAME_ON_DO_PREFIX + common_utils.get_user_hash(),
        }
        _ssh_key_id = client().ssh_keys.create(body=request)['ssh_key']
        return _ssh_key_id
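The rename from SSH_KEY_NAME_ON_DO to SSH_KEY_NAME_ON_DO_PREFIX moves the get_user_hash() call from import time to request time, presumably so the key name reflects the user hash in effect when the key is created (this release also makes the server user hash restorable at startup; see sky/server/server.py below). A trivial standalone illustration of the difference:

def get_user_hash() -> str:  # stand-in for common_utils.get_user_hash
    return 'abc123'


# Old: evaluated once at import, frozen even if the hash changes later.
SSH_KEY_NAME_ON_DO = f'sky-key-{get_user_hash()}'

# New: the prefix is constant; the hash is read when the request is built.
SSH_KEY_NAME_ON_DO_PREFIX = 'sky-key-'
name = SSH_KEY_NAME_ON_DO_PREFIX + get_user_hash()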
sky/provision/kubernetes/network_utils.py CHANGED
@@ -4,13 +4,13 @@ import time
 import typing
 from typing import Dict, List, Optional, Tuple, Union

-import sky
 from sky import exceptions
 from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
 from sky.adaptors import kubernetes
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.utils import directory_utils
 from sky.utils import kubernetes_enums
 from sky.utils import ux_utils

@@ -80,7 +80,7 @@ def get_networking_mode(
 def fill_loadbalancer_template(namespace: str, context: Optional[str],
                                service_name: str, ports: List[int],
                                selector_key: str, selector_value: str) -> Dict:
-    template_path = os.path.join(sky.__root_dir__, 'templates',
+    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
                                  _LOADBALANCER_TEMPLATE_NAME)
     if not os.path.exists(template_path):
         raise FileNotFoundError(
@@ -116,7 +116,7 @@ def fill_ingress_template(namespace: str, context: Optional[str],
                           service_details: List[Tuple[str, int,
                                                       str]], ingress_name: str,
                           selector_key: str, selector_value: str) -> Dict:
-    template_path = os.path.join(sky.__root_dir__, 'templates',
+    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
                                  _INGRESS_TEMPLATE_NAME)
     if not os.path.exists(template_path):
         raise FileNotFoundError(
sky/provision/kubernetes/utils.py CHANGED
@@ -14,7 +14,6 @@ import typing
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
 from urllib.parse import urlparse

-import sky
 from sky import clouds
 from sky import exceptions
 from sky import global_user_state
@@ -31,6 +30,7 @@ from sky.skylet import constants
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import config_utils
+from sky.utils import directory_utils
 from sky.utils import env_options
 from sky.utils import kubernetes_enums
 from sky.utils import schemas
@@ -2444,7 +2444,7 @@ def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],

 def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
                            ssh_jump_name: str, service_type: str) -> Dict:
-    template_path = os.path.join(sky.__root_dir__, 'templates',
+    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
                                  'kubernetes-ssh-jump.yml.j2')
     if not os.path.exists(template_path):
         raise FileNotFoundError(
sky/provision/kubernetes/volume.py CHANGED
@@ -203,6 +203,8 @@ def _get_pvc_spec(namespace: str,
             },
         }
     }
+    if config.labels:
+        pvc_spec['metadata']['labels'].update(config.labels)
     storage_class = config.config.get('storage_class_name')
     if storage_class is not None:
         pvc_spec['spec']['storageClassName'] = storage_class
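For illustration, a standalone sketch of the merge this hunk performs; the baseline metadata and label values here are assumptions, not the packaged defaults:

# Hypothetical PVC metadata before the merge.
pvc_spec = {'metadata': {'name': 'my-volume', 'labels': {'parent': 'skypilot'}}}
# User-supplied labels from the volume config (assumed values).
config_labels = {'team': 'ml-infra'}
if config_labels:
    # Same update as in the hunk: user labels are layered on top of the
    # labels already present in the spec.
    pvc_spec['metadata']['labels'].update(config_labels)
assert pvc_spec['metadata']['labels'] == {'parent': 'skypilot',
                                          'team': 'ml-infra'}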
sky/provision/provisioner.py CHANGED
@@ -167,7 +167,7 @@ def bulk_provision(
         # This error is a user error instead of a provisioning failure.
         # And there is no possibility to fix it by teardown.
         raise
-    except Exception:  # pylint: disable=broad-except
+    except Exception as exc:  # pylint: disable=broad-except
         zone_str = 'all zones'
         if zones:
             zone_str = ','.join(zone.name for zone in zones)
@@ -189,14 +189,18 @@
                         provider_config=original_config['provider'])
                     break
                 except NotImplementedError as e:
-                    verb = 'terminate' if terminate else 'stop'
+                    assert not terminate, (
+                        'Terminating must be supported by all clouds')
+                    exc_msg = common_utils.format_exception(exc).replace(
+                        '\n', ' ')
                     # If the underlying cloud does not support stopping
                     # instances, we should stop failover as well.
                     raise provision_common.StopFailoverError(
-                        'During provisioner\'s failover, '
-                        f'{terminate_str.lower()} {cluster_name!r} failed. '
-                        f'We cannot {verb} the resources launched, as it is '
-                        f'not supported by {cloud}. Please try launching the '
+                        f'Provisioning cluster {cluster_name.display_name} '
+                        f'failed: {exc_msg}. Failover is stopped for safety '
+                        'because the cluster was previously in UP state but '
+                        f'{cloud} does not support stopping instances to '
+                        'preserve the cluster state. Please try launching the '
                         'cluster again, or terminate it with: '
                         f'sky down {cluster_name.display_name}') from e
                 except Exception as e:  # pylint: disable=broad-except
sky/serve/replica_managers.py CHANGED
@@ -48,6 +48,13 @@ _PROCESS_POOL_REFRESH_INTERVAL = 20
 _RETRY_INIT_GAP_SECONDS = 60
 _DEFAULT_DRAIN_SECONDS = 120

+# TODO(tian): Backward compatibility. Remove this after 3 minor release, i.e.
+# 0.13.0. We move the ProcessStatus to common_utils.ProcessStatus in #6666, but
+# old ReplicaInfo in database will still tries to unpickle using ProcessStatus
+# in replica_managers. We set this alias to avoid breaking changes. See #6729
+# for more details.
+ProcessStatus = common_utils.ProcessStatus
+

 # TODO(tian): Combine this with
 # sky/spot/recovery_strategy.py::StrategyExecutor::launch
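The alias works because pickle stores a class by its defining module and attribute name and resolves that dotted name again at load time; a minimal standalone demonstration (not SkyPilot code):

import pickle


class ProcessStatus:  # imagine this class was originally defined here
    pass


blob = pickle.dumps(ProcessStatus())  # records '<module>.ProcessStatus'

# If the class later moves to another module and the old attribute is
# removed, pickle.loads(blob) raises AttributeError. Keeping an alias in
# the old module (ProcessStatus = common_utils.ProcessStatus) lets the
# stored name resolve to the relocated class, so old rows still unpickle.
restored = pickle.loads(blob)
assert isinstance(restored, ProcessStatus)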
sky/serve/server/impl.py CHANGED
@@ -129,11 +129,11 @@ def up(
                          f'{constants.CLUSTER_NAME_VALID_REGEX}')

     dag = dag_utils.convert_entrypoint_to_dag(task)
-    dag.resolve_and_validate_volumes()
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
     dag, mutated_user_config = admin_policy_utils.apply(dag)
+    dag.resolve_and_validate_volumes()
     dag.pre_mount_volumes()
     task = dag.tasks[0]
     assert task.service is not None
sky/server/requests/payloads.py CHANGED
@@ -453,6 +453,7 @@ class VolumeApplyBody(RequestBody):
     zone: Optional[str] = None
     size: Optional[str] = None
     config: Optional[Dict[str, Any]] = None
+    labels: Optional[Dict[str, str]] = None


 class VolumeDeleteBody(RequestBody):
@@ -503,6 +504,7 @@ class JobsQueueBody(RequestBody):
     pool_match: Optional[str] = None
     page: Optional[int] = None
     limit: Optional[int] = None
+    statuses: Optional[List[str]] = None


 class JobsCancelBody(RequestBody):
sky/server/requests/serializers/encoders.py CHANGED
@@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional, Tuple

 from sky.schemas.api import responses
 from sky.server import constants as server_constants
+from sky.utils import serialize_utils

 if typing.TYPE_CHECKING:
     from sky import backends
@@ -22,6 +23,9 @@ handlers: Dict[str, Any] = {}

 def pickle_and_encode(obj: Any) -> str:
     try:
+        # Apply backwards compatibility processing at the lowest level
+        # to catch any handles that might have bypassed the encoders
+        obj = serialize_utils.prepare_handle_for_backwards_compatibility(obj)
         return base64.b64encode(pickle.dumps(obj)).decode('utf-8')
     except TypeError as e:
         raise ValueError(f'Failed to pickle object: {obj}') from e
@@ -58,7 +62,9 @@ def encode_status(
     for cluster in clusters:
         response_cluster = cluster.model_dump()
         response_cluster['status'] = cluster['status'].value
-        response_cluster['handle'] = pickle_and_encode(cluster['handle'])
+        handle = serialize_utils.prepare_handle_for_backwards_compatibility(
+            cluster['handle'])
+        response_cluster['handle'] = pickle_and_encode(handle)
         response_cluster['storage_mounts_metadata'] = pickle_and_encode(
             response_cluster['storage_mounts_metadata'])
         response.append(response_cluster)
@@ -70,6 +76,7 @@ def encode_launch(
     job_id_handle: Tuple[Optional[int], Optional['backends.ResourceHandle']]
 ) -> Dict[str, Any]:
     job_id, handle = job_id_handle
+    handle = serialize_utils.prepare_handle_for_backwards_compatibility(handle)
     return {
         'job_id': job_id,
         'handle': pickle_and_encode(handle),
@@ -78,6 +85,9 @@

 @register_encoder('start')
 def encode_start(resource_handle: 'backends.CloudVmRayResourceHandle') -> str:
+    resource_handle = (
+        serialize_utils.prepare_handle_for_backwards_compatibility(
+            resource_handle))
     return pickle_and_encode(resource_handle)


@@ -113,8 +123,15 @@ def encode_status_kubernetes(
 @register_encoder('jobs.queue')
 def encode_jobs_queue(jobs_or_tuple):
     # Support returning either a plain jobs list or a (jobs, total) tuple
-    if isinstance(jobs_or_tuple, tuple) and len(jobs_or_tuple) == 2:
-        jobs, total = jobs_or_tuple
+    status_counts = {}
+    if isinstance(jobs_or_tuple, tuple):
+        if len(jobs_or_tuple) == 2:
+            jobs, total = jobs_or_tuple
+            total_no_filter = total
+        elif len(jobs_or_tuple) == 4:
+            jobs, total, status_counts, total_no_filter = jobs_or_tuple
+        else:
+            raise ValueError(f'Invalid jobs tuple: {jobs_or_tuple}')
     else:
         jobs = jobs_or_tuple
         total = None
@@ -122,7 +139,12 @@ def encode_jobs_queue(jobs_or_tuple):
         job['status'] = job['status'].value
     if total is None:
         return jobs
-    return {'jobs': jobs, 'total': total}
+    return {
+        'jobs': jobs,
+        'total': total,
+        'total_no_filter': total_no_filter,
+        'status_counts': status_counts
+    }


 def _encode_serve_status(
@@ -131,7 +153,9 @@
     service_status['status'] = service_status['status'].value
     for replica_info in service_status.get('replica_info', []):
         replica_info['status'] = replica_info['status'].value
-        replica_info['handle'] = pickle_and_encode(replica_info['handle'])
+        handle = serialize_utils.prepare_handle_for_backwards_compatibility(
+            replica_info['handle'])
+        replica_info['handle'] = pickle_and_encode(handle)
     return service_statuses

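A hypothetical client-side sketch of handling both response shapes encode_jobs_queue can now emit; the field names come from the hunk above, everything else is assumed:

def summarize_jobs_response(payload):
    if isinstance(payload, list):
        # Legacy shape: a plain list of jobs, no totals.
        return len(payload), None, None
    # New shape: total for the filtered view, the unfiltered total, and
    # per-status counts such as {'RUNNING': 3, 'FAILED': 1}.
    return (payload['total'], payload['total_no_filter'],
            payload['status_counts'])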
 
sky/server/server.py CHANGED
@@ -83,6 +83,8 @@ else:

 P = ParamSpec('P')

+_SERVER_USER_HASH_KEY = 'server_user_hash'
+

 def _add_timestamp_prefix_for_server_logs() -> None:
     server_logger = sky_logging.init_logger('sky.server')
@@ -1650,7 +1652,10 @@ async def kubernetes_pod_ssh_proxy(websocket: fastapi.WebSocket,
     await websocket.accept()
     logger.info(f'WebSocket connection accepted for cluster: {cluster_name}')

-    cluster_records = core.status(cluster_name, all_users=True)
+    # Run core.status in another thread to avoid blocking the event loop.
+    cluster_records = await context_utils.to_thread(core.status,
+                                                    cluster_name,
+                                                    all_users=True)
     cluster_record = cluster_records[0]
     if cluster_record['status'] != status_lib.ClusterStatus.UP:
         raise fastapi.HTTPException(
@@ -1818,6 +1823,35 @@ async def root():
     return fastapi.responses.RedirectResponse(url='/dashboard/')


+def _init_or_restore_server_user_hash():
+    """Restores the server user hash from the global user state db.
+
+    The API server must have a stable user hash across restarts and potential
+    multiple replicas. Thus we persist the user hash in db and restore it on
+    startup. When upgrading from old version, the user hash will be read from
+    the local file (if any) to keep the user hash consistent.
+    """
+
+    def apply_user_hash(user_hash: str) -> None:
+        # For local API server, the user hash in db and local file should be
+        # same so there is no harm to override here.
+        common_utils.set_user_hash_locally(user_hash)
+        # Refresh the server user hash for current process after restore or
+        # initialize the user hash in db, child processes will get the correct
+        # server id from the local cache file.
+        common_lib.refresh_server_id()
+
+    user_hash = global_user_state.get_system_config(_SERVER_USER_HASH_KEY)
+    if user_hash is not None:
+        apply_user_hash(user_hash)
+        return
+
+    # Initial deployment, generate a user hash and save it to the db.
+    user_hash = common_utils.get_user_hash()
+    global_user_state.set_system_config(_SERVER_USER_HASH_KEY, user_hash)
+    apply_user_hash(user_hash)
+
+
 if __name__ == '__main__':
     import uvicorn

@@ -1827,6 +1861,8 @@ if __name__ == '__main__':
     global_user_state.initialize_and_get_db()
     # Initialize request db
     requests_lib.reset_db_and_logs()
+    # Restore the server user hash
+    _init_or_restore_server_user_hash()

     parser = argparse.ArgumentParser()
     parser.add_argument('--host', default='127.0.0.1')
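The websocket fix is the usual pattern for keeping a blocking call off the event loop; a generic sketch using asyncio.to_thread (SkyPilot routes this through its own context_utils.to_thread wrapper, and the function and cluster name here are stand-ins):

import asyncio
import time


def blocking_status(cluster_name: str) -> str:
    time.sleep(1)  # stands in for a slow, synchronous status query
    return f'{cluster_name}: UP'


async def handler() -> None:
    # The sync call runs in a worker thread, so the event loop stays free
    # to service other websocket connections in the meantime.
    record = await asyncio.to_thread(blocking_status, 'my-cluster')
    print(record)


asyncio.run(handler())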
sky/setup_files/MANIFEST.in CHANGED
@@ -9,6 +9,7 @@ include sky/skylet/providers/ibm/*
 include sky/skylet/providers/scp/*
 include sky/skylet/providers/*.py
 include sky/skylet/ray_patches/*.patch
+include sky/skylet/ray_patches/*.diff
 include sky/jobs/dashboard/*
 include sky/jobs/dashboard/templates/*
 include sky/jobs/dashboard/static/*
sky/setup_files/dependencies.py CHANGED
@@ -72,12 +72,27 @@ install_requires = [
     'aiohttp',
 ]

+# See requirements-dev.txt for the version of grpc and protobuf
+# used to generate the code during development.
+
+# The grpc version at runtime has to be newer than the version
+# used to generate the code.
+GRPC = 'grpcio>=1.63.0'
+# >= 5.26.1 because the runtime version can't be older than the version
+# used to generate the code.
+# < 7.0.0 because code generated for a major version V will be supported by
+# protobuf runtimes of version V and V+1.
+# https://protobuf.dev/support/cross-version-runtime-guarantee
+PROTOBUF = 'protobuf>=5.26.1, < 7.0.0'
+
 server_dependencies = [
     'casbin',
     'sqlalchemy_adapter',
     'passlib',
     'pyjwt',
     'aiohttp',
+    GRPC,
+    PROTOBUF,
 ]

 local_ray = [
@@ -88,18 +103,9 @@ local_ray = [
     'ray[default] >= 2.2.0, != 2.6.0',
 ]

-# See requirements-dev.txt for the version of grpc and protobuf
-# used to generate the code during development.
 remote = [
-    # The grpc version at runtime has to be newer than the version
-    # used to generate the code.
-    'grpcio>=1.63.0',
-    # >= 5.26.1 because the runtime version can't be older than the version
-    # used to generate the code.
-    # < 7.0.0 because code generated for a major version V will be supported by
-    # protobuf runtimes of version V and V+1.
-    # https://protobuf.dev/support/cross-version-runtime-guarantee
-    'protobuf >= 5.26.1, < 7.0.0',
+    GRPC,
+    PROTOBUF,
 ]

 # NOTE: Change the templates/jobs-controller.yaml.j2 file if any of the
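The refactor defines each pin once and reuses it, so the server and remote extras cannot drift apart. A reduced sketch of the pattern (the extras layout below is illustrative, not the full dependencies.py):

GRPC = 'grpcio>=1.63.0'
PROTOBUF = 'protobuf>=5.26.1, < 7.0.0'

extras_require = {
    # Both extras share the exact same pins by construction.
    'server': ['casbin', 'passlib', GRPC, PROTOBUF],
    'remote': [GRPC, PROTOBUF],
}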
sky/skylet/ray_patches/__init__.py CHANGED
@@ -40,15 +40,29 @@ def _run_patch(target_file,
     """Applies a patch if it has not been applied already."""
     # .orig is the original file that is not patched.
     orig_file = os.path.abspath(f'{target_file}-v{version}.orig')
+    # Get diff filename by replacing .patch with .diff
+    diff_file = patch_file.replace('.patch', '.diff')
+
     script = f"""\
-        which patch >/dev/null 2>&1 || sudo yum install -y patch || sudo dnf install patch -y || true
-        which patch >/dev/null 2>&1 || (echo "`patch` is not found. Failed to setup ray." && exit 1)
+        which patch >/dev/null 2>&1 || sudo yum install -y patch || true
         if [ ! -f {orig_file} ]; then
             echo Create backup file {orig_file}
             cp {target_file} {orig_file}
         fi
-        # It is ok to patch again from the original file.
-        patch {orig_file} -i {patch_file} -o {target_file}
+        if which patch >/dev/null 2>&1; then
+            # System patch command is available, use it
+            # It is ok to patch again from the original file.
+            patch {orig_file} -i {patch_file} -o {target_file}
+        else
+            # System patch command not available, use Python patch library
+            echo "System patch command not available, using Python patch library..."
+            python -m pip install patch
+            # Get target directory
+            target_dir="$(dirname {target_file})"
+            # Execute python patch command
+            echo "Executing python -m patch -d $target_dir {diff_file}"
+            python -m patch -d "$target_dir" "{diff_file}"
+        fi
     """
     subprocess.run(script, shell=True, check=True)
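The new fallback drives the third-party patch.py library (installed as `patch`) when the system `patch` binary is unavailable. A rough sketch of the equivalent programmatic call, with an assumed target directory (the .diff files below use a/ and b/ prefixes, which patch.py resolves when locating files):

# Requires: python -m pip install patch
import patch  # third-party patch.py, the same library behind `python -m patch`

pset = patch.fromfile('log_monitor.py.diff')  # parse the unified diff
if pset is False:
    raise RuntimeError('Failed to parse the diff file')
# Apply relative to the directory containing the target file, mirroring
# `python -m patch -d "$target_dir" file.diff` in the shell snippet above.
if not pset.apply(root='/path/to/ray/_private'):  # assumed directory
    raise RuntimeError('Patch did not apply cleanly')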
 
sky/skylet/ray_patches/autoscaler.py.diff ADDED
@@ -0,0 +1,18 @@
+--- a/autoscaler.py
++++ b/autoscaler.py
+@@ -1,3 +1,6 @@
++# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/autoscaler.py
++# Sky patch changes:
++# - enable upscaling_speed to be 0.0
+ import copy
+ import logging
+ import math
+@@ -1071,7 +1074,7 @@
+         upscaling_speed = self.config.get("upscaling_speed")
+         aggressive = self.config.get("autoscaling_mode") == "aggressive"
+         target_utilization_fraction = self.config.get("target_utilization_fraction")
+-        if upscaling_speed:
++        if upscaling_speed is not None:  # NOTE(sky): enable 0.0
+             upscaling_speed = float(upscaling_speed)
+         # TODO(ameer): consider adding (if users ask) an option of
+         # initial_upscaling_num_workers.
sky/skylet/ray_patches/cli.py.diff ADDED
@@ -0,0 +1,19 @@
+--- a/cli.py
++++ b/cli.py
+@@ -1,3 +1,7 @@
++# Adapted from https://github.com/ray-project/ray/blob/ray-2.9.3/dashboard/modules/job/cli.py
++# Fixed the problem in ray's issue https://github.com/ray-project/ray/issues/26514
++# Otherwise, the output redirection ">" will not work.
++
+ import json
+ import os
+ import sys
+@@ -270,7 +274,7 @@
+         working_dir=working_dir,
+     )
+     job_id = client.submit_job(
+-        entrypoint=list2cmdline(entrypoint),
++        entrypoint=" ".join(entrypoint),
+         submission_id=submission_id,
+         runtime_env=final_runtime_env,
+         metadata=metadata_json,
sky/skylet/ray_patches/command_runner.py.diff ADDED
@@ -0,0 +1,17 @@
+--- a/command_runner.py
++++ b/command_runner.py
+@@ -1,3 +1,5 @@
++# From https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/autoscaler/_private/command_runner.py
++
+ import hashlib
+ import json
+ import logging
+@@ -137,7 +139,7 @@
+             {
+                 "ControlMaster": "auto",
+                 "ControlPath": "{}/%C".format(control_path),
+-                "ControlPersist": "10s",
++                "ControlPersist": "300s",
+             }
+         )
+         self.arg_dict.update(kwargs)
sky/skylet/ray_patches/log_monitor.py.diff ADDED
@@ -0,0 +1,20 @@
+--- a/log_monitor.py
++++ b/log_monitor.py
+@@ -1,3 +1,7 @@
++# Original file https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/log_monitor.py
++# Fixed the problem for progress bar, as the latest version does not preserve \r for progress bar.
++# We change the newline handling back to https://github.com/ray-project/ray/blob/ray-1.10.0/python/ray/_private/log_monitor.py#L299-L300
++
+ import argparse
+ import errno
+ import glob
+@@ -374,7 +378,8 @@
+                 next_line = next_line.decode("utf-8", "replace")
+                 if next_line == "":
+                     break
+-                next_line = next_line.rstrip("\r\n")
++                if next_line.endswith("\n"):
++                    next_line = next_line[:-1]
+
+                 if next_line.startswith(ray_constants.LOG_PREFIX_ACTOR_NAME):
+                     flush()  # Possible change of task/actor name.