skypilot-nightly 1.0.0.dev20250718__py3-none-any.whl → 1.0.0.dev20250723__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (160)
  1. sky/__init__.py +4 -2
  2. sky/admin_policy.py +11 -4
  3. sky/backends/backend_utils.py +50 -24
  4. sky/backends/cloud_vm_ray_backend.py +41 -38
  5. sky/catalog/__init__.py +3 -1
  6. sky/catalog/aws_catalog.py +8 -5
  7. sky/catalog/azure_catalog.py +8 -5
  8. sky/catalog/common.py +8 -2
  9. sky/catalog/cudo_catalog.py +5 -2
  10. sky/catalog/do_catalog.py +4 -1
  11. sky/catalog/fluidstack_catalog.py +5 -2
  12. sky/catalog/gcp_catalog.py +8 -5
  13. sky/catalog/hyperbolic_catalog.py +5 -2
  14. sky/catalog/ibm_catalog.py +8 -5
  15. sky/catalog/lambda_catalog.py +8 -5
  16. sky/catalog/nebius_catalog.py +8 -5
  17. sky/catalog/oci_catalog.py +8 -5
  18. sky/catalog/paperspace_catalog.py +4 -1
  19. sky/catalog/runpod_catalog.py +5 -2
  20. sky/catalog/scp_catalog.py +8 -5
  21. sky/catalog/vast_catalog.py +5 -2
  22. sky/catalog/vsphere_catalog.py +4 -1
  23. sky/client/cli/command.py +63 -25
  24. sky/client/sdk.py +61 -11
  25. sky/clouds/aws.py +12 -7
  26. sky/clouds/azure.py +12 -7
  27. sky/clouds/cloud.py +9 -8
  28. sky/clouds/cudo.py +13 -7
  29. sky/clouds/do.py +12 -7
  30. sky/clouds/fluidstack.py +11 -6
  31. sky/clouds/gcp.py +12 -7
  32. sky/clouds/hyperbolic.py +11 -6
  33. sky/clouds/ibm.py +11 -6
  34. sky/clouds/kubernetes.py +7 -3
  35. sky/clouds/lambda_cloud.py +11 -6
  36. sky/clouds/nebius.py +14 -12
  37. sky/clouds/oci.py +12 -7
  38. sky/clouds/paperspace.py +12 -7
  39. sky/clouds/runpod.py +12 -7
  40. sky/clouds/scp.py +11 -6
  41. sky/clouds/vast.py +14 -8
  42. sky/clouds/vsphere.py +11 -6
  43. sky/core.py +6 -1
  44. sky/dashboard/out/404.html +1 -1
  45. sky/dashboard/out/_next/static/chunks/{1043-734e57d2b27dfe5d.js → 1043-869d9c78bf5dd3df.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/{1141-d8c6404a7c6fffe6.js → 1141-e49a159c30a6c4a7.js} +1 -1
  47. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +30 -0
  48. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +6 -0
  49. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +15 -0
  52. sky/dashboard/out/_next/static/chunks/{2641.35edc9ccaeaad9e3.js → 2641.74c19c4d45a2c034.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/{4725.4c849b1e05c8e9ad.js → 4725.66125dcd9832aa5d.js} +1 -1
  55. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +16 -0
  56. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +15 -0
  57. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +1 -0
  58. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +55 -0
  59. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +41 -0
  61. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +6 -0
  62. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +30 -0
  64. sky/dashboard/out/_next/static/chunks/{9984.2b5e3fa69171bff9.js → 9984.0460de9d3adf5582.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +34 -0
  66. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-fa406155b4223d0d.js → [job]-2186770cc2de1623.js} +2 -2
  67. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-0c37ee1ac5f3474d.js → [cluster]-95afb019ab85801c.js} +1 -1
  68. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +1 -0
  70. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +1 -0
  71. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/pages/jobs/{[job]-c5b357bfd9502fbe.js → [job]-dc0299ffefebcdbe.js} +2 -2
  73. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +1 -0
  74. sky/dashboard/out/_next/static/chunks/pages/{users-19e98664bdd61643.js → users-6790fcefd5487b13.js} +1 -1
  75. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +1 -0
  76. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +1 -0
  77. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +1 -0
  78. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +3 -0
  79. sky/dashboard/out/_next/static/mym3Ciwp-zqU7ZpOLGnrW/_buildManifest.js +1 -0
  80. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  81. sky/dashboard/out/clusters/[cluster].html +1 -1
  82. sky/dashboard/out/clusters.html +1 -1
  83. sky/dashboard/out/config.html +1 -1
  84. sky/dashboard/out/index.html +1 -1
  85. sky/dashboard/out/infra/[context].html +1 -1
  86. sky/dashboard/out/infra.html +1 -1
  87. sky/dashboard/out/jobs/[job].html +1 -1
  88. sky/dashboard/out/jobs.html +1 -1
  89. sky/dashboard/out/users.html +1 -1
  90. sky/dashboard/out/volumes.html +1 -1
  91. sky/dashboard/out/workspace/new.html +1 -1
  92. sky/dashboard/out/workspaces/[name].html +1 -1
  93. sky/dashboard/out/workspaces.html +1 -1
  94. sky/data/mounting_utils.py +93 -32
  95. sky/global_user_state.py +12 -143
  96. sky/jobs/state.py +9 -88
  97. sky/jobs/utils.py +28 -13
  98. sky/provision/nebius/utils.py +3 -6
  99. sky/schemas/db/README +4 -0
  100. sky/schemas/db/env.py +90 -0
  101. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  102. sky/schemas/db/script.py.mako +28 -0
  103. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  104. sky/serve/client/sdk.py +6 -2
  105. sky/serve/controller.py +7 -3
  106. sky/serve/serve_state.py +1 -1
  107. sky/serve/serve_utils.py +171 -75
  108. sky/serve/server/core.py +17 -6
  109. sky/server/common.py +4 -3
  110. sky/server/requests/payloads.py +2 -0
  111. sky/server/requests/requests.py +1 -1
  112. sky/setup_files/MANIFEST.in +2 -0
  113. sky/setup_files/alembic.ini +148 -0
  114. sky/setup_files/dependencies.py +1 -0
  115. sky/skylet/configs.py +1 -1
  116. sky/skylet/constants.py +4 -0
  117. sky/skylet/job_lib.py +1 -1
  118. sky/skypilot_config.py +1 -1
  119. sky/users/permission.py +1 -1
  120. sky/utils/common_utils.py +85 -3
  121. sky/utils/config_utils.py +15 -0
  122. sky/utils/db/__init__.py +0 -0
  123. sky/utils/{db_utils.py → db/db_utils.py} +59 -0
  124. sky/utils/db/migration_utils.py +93 -0
  125. sky/utils/locks.py +319 -0
  126. sky/utils/schemas.py +38 -34
  127. sky/utils/timeline.py +41 -0
  128. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/METADATA +2 -1
  129. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/RECORD +134 -125
  130. sky/dashboard/out/_next/static/FUjweqdImyeYhMYFON-Se/_buildManifest.js +0 -1
  131. sky/dashboard/out/_next/static/chunks/1746.27d40aedc22bd2d6.js +0 -60
  132. sky/dashboard/out/_next/static/chunks/1871-76491ac174a95278.js +0 -6
  133. sky/dashboard/out/_next/static/chunks/2544.27f70672535675ed.js +0 -1
  134. sky/dashboard/out/_next/static/chunks/2875.c24c6d57dc82e436.js +0 -25
  135. sky/dashboard/out/_next/static/chunks/3785.95b94f18aaec7233.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/3947-b059261d6fa88a1f.js +0 -35
  137. sky/dashboard/out/_next/static/chunks/430.ed51037d1a4a438b.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/4869.bdd42f14b51d1d6f.js +0 -16
  139. sky/dashboard/out/_next/static/chunks/5491.918ffed0ba7a5294.js +0 -20
  140. sky/dashboard/out/_next/static/chunks/6990-dcb411b566e64cde.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/804-9f5e98ce84d46bdd.js +0 -21
  142. sky/dashboard/out/_next/static/chunks/9025.133e9ba5c780afeb.js +0 -6
  143. sky/dashboard/out/_next/static/chunks/938-6a9ffdaa21eee969.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/9470-b6f6a35283863a6f.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/9847.46e613d000c55859.js +0 -30
  146. sky/dashboard/out/_next/static/chunks/pages/_app-771a40cde532309b.js +0 -20
  147. sky/dashboard/out/_next/static/chunks/pages/clusters-102d169e87913ba1.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/pages/index-927ddeebe57a8ac3.js +0 -1
  149. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-8b0809f59034d509.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/pages/infra-ae9d2f705ce582c9.js +0 -1
  151. sky/dashboard/out/_next/static/chunks/pages/jobs-5bbdc71878f0a068.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-7c0187f43757a548.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces-a1e43d9ef51a9cea.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/webpack-6b0575ea521af4f3.js +0 -1
  155. sky/dashboard/out/_next/static/css/219887b94512388c.css +0 -3
  156. /sky/dashboard/out/_next/static/{FUjweqdImyeYhMYFON-Se → mym3Ciwp-zqU7ZpOLGnrW}/_ssgManifest.js +0 -0
  157. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/WHEEL +0 -0
  158. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/entry_points.txt +0 -0
  159. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/licenses/LICENSE +0 -0
  160. {skypilot_nightly-1.0.0.dev20250718.dist-info → skypilot_nightly-1.0.0.dev20250723.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '663a28261fc98dfa69214e1d4f1b0bb7b02664e0'
+_SKYPILOT_COMMIT_SHA = '874bc28c3a4b7322d30cfc544b257647379b59ed'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250718'
+__version__ = '1.0.0.dev20250723'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
@@ -104,6 +104,7 @@ from sky.client.sdk import job_status
 from sky.client.sdk import launch
 from sky.client.sdk import optimize
 from sky.client.sdk import queue
+from sky.client.sdk import reload_config
 from sky.client.sdk import start
 from sky.client.sdk import status
 from sky.client.sdk import stop
@@ -185,6 +186,7 @@ __all__ = [
    'optimize',
    'launch',
    'exec',
+    'reload_config',
    # core APIs
    'status',
    'start',
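
The only API-surface change here is the new top-level export `reload_config`, re-exported from `sky.client.sdk`. A minimal usage sketch, assuming the function takes no required arguments and simply re-reads the client's SkyPilot config (its body is not part of this diff):

import sky

sky.reload_config()  # hypothetical call; signature assumed from the bare import above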
sky/admin_policy.py CHANGED
@@ -121,11 +121,17 @@ class MutatedUserRequest:
             dict(self.skypilot_config),)).model_dump_json()
 
     @classmethod
-    def decode(cls, mutated_user_request_body: str) -> 'MutatedUserRequest':
+    def decode(cls, mutated_user_request_body: str,
+               original_request: UserRequest) -> 'MutatedUserRequest':
         mutated_user_request_body = _MutatedUserRequestBody.model_validate_json(
             mutated_user_request_body)
-        return cls(task=sky.Task.from_yaml_config(
-            common_utils.read_yaml_all_str(mutated_user_request_body.task)[0]),
+        task = sky.Task.from_yaml_config(
+            common_utils.read_yaml_all_str(mutated_user_request_body.task)[0])
+        # Some internal Task fields are not serialized. We need to manually
+        # restore them from the original request.
+        task.managed_job_dag = original_request.task.managed_job_dag
+        task.service_name = original_request.task.service_name
+        return cls(task=task,
                    skypilot_config=config_utils.Config.from_dict(
                        common_utils.read_yaml_all_str(
                            mutated_user_request_body.skypilot_config)[0],))
@@ -243,7 +249,8 @@ class RestfulAdminPolicy(PolicyTemplate):
                     f'{self.policy_url}: {e}') from None
 
         try:
-            mutated_user_request = MutatedUserRequest.decode(response.json())
+            mutated_user_request = MutatedUserRequest.decode(
+                response.json(), user_request)
         except Exception as e:  # pylint: disable=broad-except
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.RestfulPolicyError(
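
The extra `original_request` argument exists because some `Task` fields (`managed_job_dag`, `service_name`) do not survive the YAML round trip, so `decode()` copies them back from the caller's copy. A self-contained sketch of this restore-transient-fields pattern, using illustrative names rather than the actual SkyPilot classes:

import dataclasses
import json
from typing import Any, Optional

@dataclasses.dataclass
class Task:
    name: str
    managed_job_dag: Optional[Any] = None  # transient: never serialized

def encode(task: Task) -> str:
    return json.dumps({'name': task.name})

def decode(body: str, original: Task) -> Task:
    task = Task(**json.loads(body))
    # Restore the transient field from the caller's original object,
    # mirroring what MutatedUserRequest.decode() now does.
    task.managed_job_dag = original.managed_job_dag
    return task

t = Task('train', managed_job_dag=object())
assert decode(encode(t), t).managed_job_dag is t.managed_job_dag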
sky/backends/backend_utils.py CHANGED
@@ -17,7 +17,6 @@ from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
 import uuid
 
 import colorama
-import filelock
 from packaging import version
 from typing_extensions import Literal
 
@@ -45,6 +44,7 @@ from sky.utils import common_utils
 from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import env_options
+from sky.utils import locks
 from sky.utils import registry
 from sky.utils import resources_utils
 from sky.utils import rich_utils
@@ -104,23 +104,18 @@ WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
 # Fixed IP addresses are used to avoid DNS lookup blocking the check, for
 # machine with no internet connection.
 # Refer to: https://stackoverflow.com/questions/3764291/how-can-i-see-if-theres-an-available-and-active-network-connection-in-python  # pylint: disable=line-too-long
-_TEST_IP_LIST = ['https://1.1.1.1', 'https://8.8.8.8']
+_TEST_IP_LIST = ['https://8.8.8.8', 'https://1.1.1.1']
 
 # Allow each CPU thread take 2 tasks.
 # Note: This value cannot be too small, otherwise OOM issue may occur.
 DEFAULT_TASK_CPU_DEMAND = 0.5
 
-# Filelocks for the cluster status change.
-CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
 CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
 
 # Time that must elapse since the last status check before we should re-check if
 # the cluster has been terminated or autostopped.
 _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
 
-# Filelocks for updating cluster's file_mounts.
-CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
-    '~/.sky/.{}_file_mounts.lock')
 CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
 
 # Remote dir that holds our runtime files.
@@ -1635,18 +1630,28 @@ def get_node_ips(cluster_yaml: str,
 
 def check_network_connection():
     # Tolerate 3 retries as it is observed that connections can fail.
-    adapter = adapters.HTTPAdapter(max_retries=retry_lib.Retry(total=3))
     http = requests.Session()
-    http.mount('https://', adapter)
-    http.mount('http://', adapter)
-    for i, ip in enumerate(_TEST_IP_LIST):
-        try:
-            http.head(ip, timeout=3)
-            return
-        except (requests.Timeout, requests.exceptions.ConnectionError) as e:
-            if i == len(_TEST_IP_LIST) - 1:
-                raise exceptions.NetworkError('Could not refresh the cluster. '
-                                              'Network seems down.') from e
+    http.mount('https://', adapters.HTTPAdapter())
+    http.mount('http://', adapters.HTTPAdapter())
+
+    # Alternate between IPs on each retry
+    max_retries = 3
+    timeout = 0.5
+
+    for _ in range(max_retries):
+        for ip in _TEST_IP_LIST:
+            try:
+                http.head(ip, timeout=timeout)
+                return
+            except (requests.Timeout, requests.exceptions.ConnectionError):
+                continue
+
+        timeout *= 2  # Double the timeout for next retry
+
+    # If we get here, all IPs failed
+    # Assume network connection is down
+    raise exceptions.NetworkError('Could not refresh the cluster. '
+                                  'Network seems down.')
 
 
 @timeline.event
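
A note on the rewritten probe loop: the per-request urllib3 retries are gone, replaced by three explicit rounds over both endpoints with a doubling timeout. Back-of-envelope worst case, assuming every probe burns its full timeout and connection setup is free:

total = sum(2 * 0.5 * (2 ** i) for i in range(3))  # 2 endpoints per round; 0.5s, 1s, 2s
print(total)  # 7.0 seconds of probing before NetworkError is raised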
@@ -1995,9 +2000,20 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
 
         total_nodes = handle.launched_nodes * handle.num_ips_per_node
 
+        cloud_name = repr(handle.launched_resources.cloud).lower()
         for i in range(5):
-            ready_head, ready_workers, output, stderr = (
-                get_node_counts_from_ray_status(head_runner))
+            try:
+                ready_head, ready_workers, output, stderr = (
+                    get_node_counts_from_ray_status(head_runner))
+            except RuntimeError as e:
+                logger.debug(f'Refreshing status ({cluster_name!r}) attempt'
+                             f' {i}: {common_utils.format_exception(e)}')
+                if cloud_name != 'kubernetes':
+                    raise e
+                # We retry for kubernetes because coreweave can have a
+                # transient network issue.
+                time.sleep(1)
+                continue
             if ready_head + ready_workers == total_nodes:
                 return True
             logger.debug(f'Refreshing status ({cluster_name!r}) attempt '
@@ -2284,8 +2300,7 @@ def refresh_cluster_record(
 
     # The loop logic allows us to notice if the status was updated in the
    # global_user_state by another process and stop trying to get the lock.
-    # The core loop logic is adapted from FileLock's implementation.
-    lock = filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
+    lock = locks.get_lock(cluster_status_lock_id(cluster_name))
     start_time = time.perf_counter()
 
     # Loop until we have an up-to-date status or until we acquire the lock.
@@ -2309,7 +2324,8 @@ def refresh_cluster_record(
                 return record
             # Update and return the cluster status.
             return _update_cluster_status(cluster_name)
-        except filelock.Timeout:
+
+        except locks.LockTimeout:
             # lock.acquire() will throw a Timeout exception if the lock is not
             # available and we have blocking=False.
             pass
@@ -2610,7 +2626,7 @@ def is_controller_accessible(
             need_connection_check):
         # Check ssh connection if (1) controller is in INIT state, or (2) we failed to fetch the
         # status, both of which can happen when controller's status lock is held by another `sky jobs launch` or
-        # `sky serve up`.  If we have controller's head_ip available and it is ssh-reachable,
+        # `sky serve up`. If we have controller's head_ip available and it is ssh-reachable,
         # we can allow access to the controller.
         ssh_credentials = ssh_credential_from_yaml(handle.cluster_yaml,
                                                    handle.docker_user,
@@ -3187,3 +3203,13 @@ def get_endpoints(cluster: str,
     return {
         port_num: urls[0].url() for port_num, urls in port_details.items()
     }
+
+
+def cluster_status_lock_id(cluster_name: str) -> str:
+    """Get the lock ID for cluster status operations."""
+    return f'{cluster_name}_status'
+
+
+def cluster_file_mounts_lock_id(cluster_name: str) -> str:
+    """Get the lock ID for cluster file mounts operations."""
+    return f'{cluster_name}_file_mounts'
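
Throughout this release, the `CLUSTER_STATUS_LOCK_PATH`/`filelock` pair is replaced by the new `sky.utils.locks` module (`sky/utils/locks.py`, +319 lines in the file list) keyed by the string IDs from the two helpers above. A usage sketch inferred only from the call sites visible in this diff; anything beyond `get_lock(lock_id, timeout)`, `LockTimeout`, and `force_unlock()` is an assumption:

from sky.utils import locks

lock_id = 'my-cluster_status'  # what cluster_status_lock_id('my-cluster') yields
try:
    with locks.get_lock(lock_id, 20):  # timeout in seconds, passed positionally
        pass  # critical section: read/update the cluster record
except locks.LockTimeout:
    # Another operation holds the lock; callers here either retry or give up.
    locks.get_lock(lock_id).force_unlock()  # last resort, as teardown does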
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -20,7 +20,6 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
                     Union)
 
 import colorama
-import filelock
 import yaml
 
 import sky
@@ -64,6 +63,7 @@ from sky.utils import common_utils
 from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import env_options
+from sky.utils import locks
 from sky.utils import log_utils
 from sky.utils import message_utils
 from sky.utils import registry
@@ -2916,9 +2916,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         # Check if the cluster is owned by the current user. Raise
         # exceptions.ClusterOwnerIdentityMismatchError
         backend_utils.check_owner_identity(cluster_name)
-        lock_path = os.path.expanduser(
-            backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
-        with timeline.FileLockEvent(lock_path):
+        lock_id = backend_utils.cluster_status_lock_id(cluster_name)
+        with timeline.DistributedLockEvent(lock_id):
             # Try to launch the exiting cluster first. If no existing cluster,
             # this function will create a to_provision_config with required
             # resources.
@@ -3065,7 +3064,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
                     self._update_after_cluster_provisioned(
                         handle, to_provision_config.prev_handle, task,
-                        prev_cluster_status, lock_path, config_hash)
+                        prev_cluster_status, lock_id, config_hash)
                     return handle, False
 
             cluster_config_file = config_dict['ray']
@@ -3137,7 +3136,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
             self._update_after_cluster_provisioned(
                 handle, to_provision_config.prev_handle, task,
-                prev_cluster_status, lock_path, config_hash)
+                prev_cluster_status, lock_id, config_hash)
             return handle, False
 
     def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -3155,7 +3154,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             prev_handle: Optional[CloudVmRayResourceHandle],
             task: task_lib.Task,
             prev_cluster_status: Optional[status_lib.ClusterStatus],
-            lock_path: str, config_hash: str) -> None:
+            lock_id: str, config_hash: str) -> None:
         usage_lib.messages.usage.update_cluster_resources(
             handle.launched_nodes, handle.launched_resources)
         usage_lib.messages.usage.update_final_cluster_status(
@@ -3237,7 +3236,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             handle.cached_external_ssh_ports, handle.docker_user,
             handle.ssh_user)
 
-        common_utils.remove_file_if_exists(lock_path)
+        locks.get_lock(lock_id).force_unlock()
 
     def _sync_workdir(self, handle: CloudVmRayResourceHandle,
                       workdir: Union[Path, Dict[str, Any]],
@@ -3819,8 +3818,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     is_identity_mismatch_and_purge = True
                 else:
                     raise
-        lock_path = os.path.expanduser(
-            backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
+        lock_id = backend_utils.cluster_status_lock_id(cluster_name)
+        lock = locks.get_lock(lock_id)
         # Retry in case new cluster operation comes in and holds the lock
         # right after the lock is removed.
         n_attempts = 2
@@ -3828,7 +3827,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             n_attempts -= 1
             # In case other running cluster operations are still holding the
             # lock.
-            common_utils.remove_file_if_exists(lock_path)
+            lock.force_unlock()
             # We have to kill the cluster requests, because `down` and `stop`
             # should be higher priority than the cluster requests, and we should
             # release the lock from other requests.
@@ -3847,9 +3846,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                            f'cluster {handle.cluster_name}: '
                            f'{common_utils.format_exception(e, use_bracket=True)}')
            try:
-                with filelock.FileLock(
-                        lock_path,
-                        backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
+                with lock:
                    self.teardown_no_lock(
                        handle,
                        terminate,
@@ -3862,14 +3859,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                        refresh_cluster_status=(
                            not is_identity_mismatch_and_purge))
                if terminate:
-                    common_utils.remove_file_if_exists(lock_path)
+                    lock.force_unlock()
                break
-            except locks.LockTimeout as e:
+            except locks.LockTimeout as e:
                logger.debug(f'Failed to acquire lock for {cluster_name}, '
                             f'retrying...')
                if n_attempts <= 0:
                    raise RuntimeError(
-                        f'Cluster {cluster_name!r} is locked by {lock_path}. '
+                        f'Cluster {cluster_name!r} is locked by {lock_id}. '
                        'Check to see if it is still being launched') from e
 
    # --- CloudVMRayBackend Specific APIs ---
@@ -3988,12 +3985,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         return dict(zip(job_ids, local_log_dirs))
 
     @context_utils.cancellation_guard
-    def tail_logs(self,
-                  handle: CloudVmRayResourceHandle,
-                  job_id: Optional[int],
-                  managed_job_id: Optional[int] = None,
-                  follow: bool = True,
-                  tail: int = 0) -> int:
+    def tail_logs(
+            self,
+            handle: CloudVmRayResourceHandle,
+            job_id: Optional[int],
+            managed_job_id: Optional[int] = None,
+            follow: bool = True,
+            tail: int = 0,
+            require_outputs: bool = False,
+            stream_logs: bool = True,
+            process_stream: bool = False) -> Union[int, Tuple[int, str, str]]:
         """Tail the logs of a job.
 
         Args:
@@ -4003,6 +4004,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             follow: Whether to follow the logs.
             tail: The number of lines to display from the end of the
                 log file. If 0, print all lines.
+            require_outputs: Whether to return the stdout/stderr of the command.
+            stream_logs: Whether to stream the logs to stdout/stderr.
+            process_stream: Whether to process the stream.
 
         Returns:
             The exit code of the tail command. Returns code 100 if the job has
@@ -4022,18 +4026,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
         signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
         try:
-            returncode = self.run_on_head(
+            final = self.run_on_head(
                 handle,
                 code,
-                stream_logs=True,
-                process_stream=False,
+                stream_logs=stream_logs,
+                process_stream=process_stream,
+                require_outputs=require_outputs,
                 # Allocate a pseudo-terminal to disable output buffering.
                 # Otherwise, there may be 5 minutes delay in logging.
                 ssh_mode=command_runner.SshMode.INTERACTIVE,
             )
         except SystemExit as e:
-            returncode = e.code
-        return returncode
+            final = e.code
+        return final
 
     def tail_managed_job_logs(self,
                               handle: CloudVmRayResourceHandle,
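
With `require_outputs=True`, `tail_logs()` now returns whatever `run_on_head()` returns, i.e. a `(returncode, stdout, stderr)` tuple rather than a bare exit code. A hedged call sketch (`backend` and `handle` are assumed to refer to an existing `CloudVmRayBackend` and a live cluster):

code, stdout, stderr = backend.tail_logs(
    handle,
    job_id=1,
    follow=False,
    require_outputs=True,   # capture output instead of returning only the code
    stream_logs=False,      # don't also echo to this process's stdout/stderr
    process_stream=True)    # let the runner post-process the captured stream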
@@ -5237,18 +5242,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 # reconstruct them during cluster restart.
                 continue
             storage_mounts_metadata[dst] = storage_obj.handle
-        lock_path = (
-            backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
+        lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
         lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
         try:
-            with filelock.FileLock(lock_path, lock_timeout):
+            with locks.get_lock(lock_id, lock_timeout):
                 global_user_state.set_cluster_storage_mounts_metadata(
                     cluster_name, storage_mounts_metadata)
-        except filelock.Timeout as e:
+        except locks.LockTimeout as e:
             raise RuntimeError(
                 f'Failed to store metadata for cluster {cluster_name!r} due to '
                 'a timeout when trying to access local database. Please '
-                f'try again or manually remove the lock at {lock_path}. '
+                f'try again or manually remove the lock at {lock_id}. '
                 f'{common_utils.format_exception(e)}') from None
 
     def get_storage_mounts_metadata(
@@ -5259,19 +5263,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         After retrieving storage_mounts_metadata, it converts back the
         StorageMetadata to Storage object and restores 'storage_mounts.'
         """
-        lock_path = (
-            backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
+        lock_id = backend_utils.cluster_file_mounts_lock_id(cluster_name)
         lock_timeout = backend_utils.CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS
         try:
-            with filelock.FileLock(lock_path, lock_timeout):
+            with locks.get_lock(lock_id, lock_timeout):
                 storage_mounts_metadata = (
                     global_user_state.get_cluster_storage_mounts_metadata(
                         cluster_name))
-        except filelock.Timeout as e:
+        except locks.LockTimeout as e:
             raise RuntimeError(
                 f'Failed to retrieve metadata for cluster {cluster_name!r} '
                 'due to a timeout when trying to access local database. '
-                f'Please try again or manually remove the lock at {lock_path}.'
+                f'Please try again or manually remove the lock at {lock_id}.'
                 f' {common_utils.format_exception(e)}') from None
 
         if storage_mounts_metadata is None:
sky/catalog/__init__.py CHANGED
@@ -221,6 +221,8 @@ def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
                               disk_tier: Optional[
                                   resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None,
                               clouds: CloudFilter = None) -> Optional[str]:
     """Returns the cloud's default instance type for given #vCPUs and memory.
 
@@ -234,7 +236,7 @@ def get_default_instance_type(cpus: Optional[str] = None,
         the given CPU and memory requirement.
     """
     return _map_clouds_catalog(clouds, 'get_default_instance_type', cpus,
-                               memory, disk_tier)
+                               memory, disk_tier, region, zone)
 
 
 def get_accelerators_from_instance_type(
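
A call sketch for the extended lookup; the `clouds='aws'` filter value is an assumption about `CloudFilter` (this diff only shows the two parameters being threaded through to the per-cloud catalogs):

from sky import catalog

instance_type = catalog.get_default_instance_type(
    cpus='8+', memory='32+', region='us-east-1', zone='us-east-1a',
    clouds='aws')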
sky/catalog/aws_catalog.py CHANGED
@@ -230,10 +230,12 @@ def get_vcpus_mem_from_instance_type(
                                                         instance_type)
 
 
-def get_default_instance_type(
-        cpus: Optional[str] = None,
-        memory: Optional[str] = None,
-        disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # unused
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
@@ -247,7 +249,8 @@ def get_default_instance_type(
     df = _get_df()
     df = df[df['InstanceType'].str.startswith(instance_type_prefix)]
     return common.get_instance_type_for_cpus_mem_impl(df, cpus,
-                                                      memory_gb_or_ratio)
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/azure_catalog.py CHANGED
@@ -114,10 +114,12 @@ def _get_instance_family(instance_type: str) -> str:
     return instance_family
 
 
-def get_default_instance_type(
-        cpus: Optional[str] = None,
-        memory: Optional[str] = None,
-        disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
     if memory is None:
@@ -133,7 +135,8 @@ def get_default_instance_type(
 
     df = df.loc[df['InstanceType'].apply(_filter_disk_type)]
     return common.get_instance_type_for_cpus_mem_impl(df, cpus,
-                                                      memory_gb_or_ratio)
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/common.py CHANGED
@@ -476,8 +476,11 @@ def _filter_region_zone(df: 'pd.DataFrame', region: Optional[str],
 
 
 def get_instance_type_for_cpus_mem_impl(
-        df: 'pd.DataFrame', cpus: Optional[str],
-        memory_gb_or_ratio: Optional[str]) -> Optional[str]:
+        df: 'pd.DataFrame',
+        cpus: Optional[str],
+        memory_gb_or_ratio: Optional[str],
+        region: Optional[str] = None,
+        zone: Optional[str] = None) -> Optional[str]:
     """Returns the cheapest instance type that satisfies the requirements.
 
     Args:
@@ -490,7 +493,10 @@ def get_instance_type_for_cpus_mem_impl(
             returned instance type should have at least the given memory size.
             If the string ends with "x", then the returned instance type should
             have at least the given number of vCPUs times the given ratio.
+        region: The region to filter by.
+        zone: The zone to filter by.
     """
+    df = _filter_region_zone(df, region, zone)
     df = _filter_with_cpus(df, cpus)
     df = _filter_with_mem(df, memory_gb_or_ratio)
     if df.empty:
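
The new region/zone filter runs before the CPU and memory filters, so the result is the cheapest instance type actually offered in the requested location. A minimal pandas sketch of that ordering; the column names only loosely mirror the catalog CSVs, and the price-sorting step is assumed from the docstring ("cheapest instance type"):

import pandas as pd

df = pd.DataFrame({
    'InstanceType': ['m6i.large', 'm6i.xlarge'],
    'vCPUs': [2, 4],
    'MemoryGiB': [8, 16],
    'Region': ['us-east-1', 'us-west-2'],
    'Price': [0.096, 0.192],
})
df = df[df['Region'] == 'us-east-1']   # new: region/zone filter first
df = df[df['vCPUs'] >= 2]              # existing: CPU filter
df = df[df['MemoryGiB'] >= 8]          # existing: memory filter
best = None if df.empty else df.sort_values('Price')['InstanceType'].iloc[0]
print(best)  # 'm6i.large'

The same two parameters are threaded through every per-cloud catalog below (aws, azure, cudo, do, fluidstack, gcp, hyperbolic, ibm, ...), each forwarding them to this implementation.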
sky/catalog/cudo_catalog.py CHANGED
@@ -51,7 +51,9 @@ def get_vcpus_mem_from_instance_type(
 
 def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
-                              disk_tier: Optional[str] = None) -> Optional[str]:
+                              disk_tier: Optional[str] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier
     # NOTE: After expanding catalog to multiple entries, you may
     # want to specify a default instance type or family.
@@ -62,7 +64,8 @@ def get_default_instance_type(cpus: Optional[str] = None,
     if memory is None:
         memory_gb_or_ratio = f'{_DEFAULT_MEMORY_CPU_RATIO}x'
     return common.get_instance_type_for_cpus_mem_impl(_df, cpus,
-                                                      memory_gb_or_ratio)
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/do_catalog.py CHANGED
@@ -52,11 +52,14 @@ def get_default_instance_type(
     cpus: Optional[str] = None,
     memory: Optional[str] = None,
     disk_tier: Optional[str] = None,
+    region: Optional[str] = None,
+    zone: Optional[str] = None,
 ) -> Optional[str]:
     # NOTE: After expanding catalog to multiple entries, you may
     # want to specify a default instance type or family.
     del disk_tier  # unused
-    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
+    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
+                                                      zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/fluidstack_catalog.py CHANGED
@@ -52,7 +52,9 @@ def get_vcpus_mem_from_instance_type(
 
 def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
-                              disk_tier: Optional[str] = None) -> Optional[str]:
+                              disk_tier: Optional[str] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # unused
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
@@ -61,7 +63,8 @@ def get_default_instance_type(cpus: Optional[str] = None,
     else:
         memory_gb_or_ratio = memory
     return common.get_instance_type_for_cpus_mem_impl(_df, cpus,
-                                                      memory_gb_or_ratio)
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/gcp_catalog.py CHANGED
@@ -279,10 +279,12 @@ def get_vcpus_mem_from_instance_type(
     return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
 
 
-def get_default_instance_type(
-        cpus: Optional[str] = None,
-        memory: Optional[str] = None,
-        disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
     if memory is None:
@@ -300,7 +302,8 @@ def get_default_instance_type(
 
     df = df.loc[df['InstanceType'].apply(_filter_disk_type)]
     return common.get_instance_type_for_cpus_mem_impl(df, cpus,
-                                                      memory_gb_or_ratio)
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def get_accelerators_from_instance_type(
sky/catalog/hyperbolic_catalog.py CHANGED
@@ -67,9 +67,12 @@ def get_zone_shell_cmd() -> Optional[str]:
 
 def get_default_instance_type(cpus: Optional[str] = None,
                               memory: Optional[str] = None,
-                              disk_tier: Optional[str] = None) -> Optional[str]:
+                              disk_tier: Optional[str] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # Unused
-    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
+    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
+                                                      zone)
 
 
 def get_instance_type_for_accelerator(
sky/catalog/ibm_catalog.py CHANGED
@@ -92,10 +92,12 @@ def list_accelerators(
                                                  case_sensitive, all_regions)
 
 
-def get_default_instance_type(
-        cpus: Optional[str] = None,
-        memory: Optional[str] = None,
-        disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
+def get_default_instance_type(cpus: Optional[str] = None,
+                              memory: Optional[str] = None,
+                              disk_tier: Optional[
+                                  resources_utils.DiskTier] = None,
+                              region: Optional[str] = None,
+                              zone: Optional[str] = None) -> Optional[str]:
     del disk_tier  # unused
     if cpus is None and memory is None:
         cpus = f'{_DEFAULT_NUM_VCPUS}+'
@@ -107,7 +109,8 @@ def get_default_instance_type(
     instance_type_prefix = f'{_DEFAULT_INSTANCE_FAMILY}-'
     df = _df[_df['InstanceType'].str.startswith(instance_type_prefix)]
     return common.get_instance_type_for_cpus_mem_impl(df, cpus,
-                                                      memory_gb_or_ratio)
+                                                      memory_gb_or_ratio,
+                                                      region, zone)
 
 
 def is_image_tag_valid(tag: str, region: Optional[str]) -> bool: