PyPI - skypilot-nightly - Versions diffs - 1.0.0.dev20250902__py3-none-any.whl → 1.0.0.dev20250903__py3-none-any.whl - Mend

skypilot-nightly 1.0.0.dev20250902__py3-none-any.whl → 1.0.0.dev20250903__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (59) hide show

sky/__init__.py +2 -2
sky/adaptors/runpod.py +68 -0
sky/backends/backend_utils.py +5 -3
sky/client/cli/command.py +20 -5
sky/clouds/kubernetes.py +1 -1
sky/clouds/runpod.py +17 -0
sky/dashboard/out/404.html +1 -1
sky/dashboard/out/_next/static/chunks/1121-ec35954c8cbea535.js +1 -0
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b77360a343d48902.js +16 -0
sky/dashboard/out/_next/static/chunks/{webpack-0eaa6f7e63f51311.js → webpack-60556df644cd5d71.js} +1 -1
sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → yLz6EPhW_XXmnNs1I6dmS}/_buildManifest.js +1 -1
sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
sky/dashboard/out/clusters/[cluster].html +1 -1
sky/dashboard/out/clusters.html +1 -1
sky/dashboard/out/config.html +1 -1
sky/dashboard/out/index.html +1 -1
sky/dashboard/out/infra/[context].html +1 -1
sky/dashboard/out/infra.html +1 -1
sky/dashboard/out/jobs/[job].html +1 -1
sky/dashboard/out/jobs/pools/[pool].html +1 -1
sky/dashboard/out/jobs.html +1 -1
sky/dashboard/out/users.html +1 -1
sky/dashboard/out/volumes.html +1 -1
sky/dashboard/out/workspace/new.html +1 -1
sky/dashboard/out/workspaces/[name].html +1 -1
sky/dashboard/out/workspaces.html +1 -1
sky/global_user_state.py +5 -2
sky/models.py +1 -0
sky/provision/runpod/__init__.py +3 -0
sky/provision/runpod/instance.py +17 -0
sky/provision/runpod/utils.py +23 -5
sky/provision/runpod/volume.py +158 -0
sky/server/requests/payloads.py +7 -1
sky/server/requests/preconditions.py +8 -7
sky/server/requests/requests.py +123 -57
sky/server/server.py +32 -25
sky/server/stream_utils.py +14 -6
sky/server/uvicorn.py +2 -1
sky/templates/kubernetes-ray.yml.j2 +5 -5
sky/templates/runpod-ray.yml.j2 +8 -0
sky/utils/benchmark_utils.py +60 -0
sky/utils/command_runner.py +4 -0
sky/utils/db/migration_utils.py +20 -4
sky/utils/resource_checker.py +6 -5
sky/utils/schemas.py +1 -1
sky/utils/volume.py +3 -0
sky/volumes/client/sdk.py +28 -0
sky/volumes/server/server.py +11 -1
sky/volumes/utils.py +117 -68
sky/volumes/volume.py +98 -39
{skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/METADATA +34 -34
{skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/RECORD +57 -55
sky/dashboard/out/_next/static/chunks/1121-8afcf719ea87debc.js +0 -1
sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +0 -16
sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → yLz6EPhW_XXmnNs1I6dmS}/_ssgManifest.js +0 -0
{skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/WHEEL +0 -0
{skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/entry_points.txt +0 -0
{skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/licenses/LICENSE +0 -0
{skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250903.dist-info}/top_level.txt +0 -0

sky/dashboard/out/workspaces/[name].html CHANGED Viewed

	@@ -1 +1 @@
1	- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-~~0eaa6f7e63f51311~~.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7205-88191679e7988c57.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4a6f1a928fb6d370.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-~~8afcf719ea87debc~~.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-8089ed1e0b7e37fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-de06e613e20bc977.js" defer=""></script><script src="/dashboard/_next/static/~~tio0QibqY2C0F2-rPy00p~~/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/~~tio0QibqY2C0F2-rPy00p~~/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"~~tio0QibqY2C0F2-rPy00p~~","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1	+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-60556df644cd5d71.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-3d59f75e2ccf9321.js" defer=""></script><script src="/dashboard/_next/static/chunks/6130-2be46d70a38f1e82.js" defer=""></script><script src="/dashboard/_next/static/chunks/5739-d67458fcb1386c92.js" defer=""></script><script src="/dashboard/_next/static/chunks/7411-b15471acd2cba716.js" defer=""></script><script src="/dashboard/_next/static/chunks/1272-1ef0bf0237faccdb.js" defer=""></script><script src="/dashboard/_next/static/chunks/7205-88191679e7988c57.js" defer=""></script><script src="/dashboard/_next/static/chunks/6989-01359c57e018caa4.js" defer=""></script><script src="/dashboard/_next/static/chunks/3850-ff4a9a69d978632b.js" defer=""></script><script src="/dashboard/_next/static/chunks/8969-4a6f1a928fb6d370.js" defer=""></script><script src="/dashboard/_next/static/chunks/6990-08b2a1cae076a943.js" defer=""></script><script src="/dashboard/_next/static/chunks/6135-4b4d5e824b7f9d3c.js" defer=""></script><script src="/dashboard/_next/static/chunks/1121-ec35954c8cbea535.js" defer=""></script><script src="/dashboard/_next/static/chunks/6601-06114c982db410b6.js" defer=""></script><script 
src="/dashboard/_next/static/chunks/3015-8089ed1e0b7e37fd.js" defer=""></script><script src="/dashboard/_next/static/chunks/1141-943efc7aff0f0c06.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-de06e613e20bc977.js" defer=""></script><script src="/dashboard/_next/static/yLz6EPhW_XXmnNs1I6dmS/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/yLz6EPhW_XXmnNs1I6dmS/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"yLz6EPhW_XXmnNs1I6dmS","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>

sky/dashboard/out/workspaces.html CHANGED Viewed

	@@ -1 +1 @@
1	- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-~~0eaa6f7e63f51311~~.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-be35b22e2046564c.js" defer=""></script><script src="/dashboard/_next/static/~~tio0QibqY2C0F2-rPy00p~~/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/~~tio0QibqY2C0F2-rPy00p~~/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"~~tio0QibqY2C0F2-rPy00p~~","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
1	+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/4614e06482d7309e.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/4614e06482d7309e.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-60556df644cd5d71.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-cf60a09ccd051a10.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-f15ccb73239a3bf1.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-ce361c6959bc2001.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-be35b22e2046564c.js" defer=""></script><script src="/dashboard/_next/static/yLz6EPhW_XXmnNs1I6dmS/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/yLz6EPhW_XXmnNs1I6dmS/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"yLz6EPhW_XXmnNs1I6dmS","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>

sky/global_user_state.py CHANGED Viewed

@@ -299,7 +299,9 @@ def create_table(engine: sqlalchemy.engine.Engine):
 # a session has already been created with _SQLALCHEMY_ENGINE = e1,
 # and then another thread overwrites _SQLALCHEMY_ENGINE = e2
 # which could result in e1 being garbage collected unexpectedly.
-def initialize_and_get_db() -> sqlalchemy.engine.Engine:
+def initialize_and_get_db(
+    pg_pool_class: Optional[sqlalchemy.pool.Pool] = None
+) -> sqlalchemy.engine.Engine:
     global _SQLALCHEMY_ENGINE
     if _SQLALCHEMY_ENGINE is not None:
@@ -308,7 +310,8 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
         if _SQLALCHEMY_ENGINE is not None:
             return _SQLALCHEMY_ENGINE
         # get an engine to the db
-        engine = migration_utils.get_engine('state')
+        engine = migration_utils.get_engine('state',
+                                            pg_pool_class=pg_pool_class)
         # run migrations if needed
         create_table(engine)

sky/models.py CHANGED Viewed

@@ -109,3 +109,4 @@ class VolumeConfig(pydantic.BaseModel):
     size: Optional[str]
     config: Dict[str, Any] = {}
     labels: Optional[Dict[str, str]] = None
+    id_on_cloud: Optional[str] = None

sky/provision/runpod/__init__.py CHANGED Viewed

@@ -9,3 +9,6 @@ from sky.provision.runpod.instance import run_instances
 from sky.provision.runpod.instance import stop_instances
 from sky.provision.runpod.instance import terminate_instances
 from sky.provision.runpod.instance import wait_instances
+from sky.provision.runpod.volume import apply_volume
+from sky.provision.runpod.volume import delete_volume
+from sky.provision.runpod.volume import get_volume_usedby

sky/provision/runpod/instance.py CHANGED Viewed

@@ -80,6 +80,21 @@ def run_instances(region: str, cluster_name_on_cloud: str,
             created_instance_ids=[])
     created_instance_ids = []
+    volume_mounts = config.node_config.get('VolumeMounts', [])
+    network_volume_id = None
+    volume_mount_path = None
+    if volume_mounts:
+        if len(volume_mounts) > 1:
+            logger.warning(
+                f'RunPod only supports one network volume mount, '
+                f'but {len(volume_mounts)} are specified. Only the first one '
+                f'will be used.')
+        volume_mount = volume_mounts[0]
+        network_volume_id = volume_mount.get('VolumeIdOnCloud')
+        volume_mount_path = volume_mount.get('MountPath')
+        if network_volume_id is None or volume_mount_path is None:
+            raise RuntimeError(
+                'Network volume ID and mount path must be specified.')
     for _ in range(to_start_count):
         node_type = 'head' if head_instance_id is None else 'worker'
         try:
@@ -97,6 +112,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                 bid_per_gpu=config.node_config['BidPerGPU'],
                 docker_login_config=config.provider_config.get(
                     'docker_login_config'),
+                network_volume_id=network_volume_id,
+                volume_mount_path=volume_mount_path,
             )
         except Exception as e:  # pylint: disable=broad-except
             logger.warning(f'run_instances error: {e}')

sky/provision/runpod/utils.py CHANGED Viewed

@@ -263,11 +263,23 @@ def _create_template_for_docker_login(
     return login_config.format_image(image_name), create_template_resp['id']
-def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
-           zone: str, disk_size: int, image_name: str,
-           ports: Optional[List[int]], public_key: str,
-           preemptible: Optional[bool], bid_per_gpu: float,
-           docker_login_config: Optional[Dict[str, str]]) -> str:
+def launch(
+    cluster_name: str,
+    node_type: str,
+    instance_type: str,
+    region: str,
+    zone: str,
+    disk_size: int,
+    image_name: str,
+    ports: Optional[List[int]],
+    public_key: str,
+    preemptible: Optional[bool],
+    bid_per_gpu: float,
+    docker_login_config: Optional[Dict[str, str]],
+    *,
+    network_volume_id: Optional[str] = None,
+    volume_mount_path: Optional[str] = None,
+) -> str:
     """Launches an instance with the given parameters.
     For CPU instances, we directly use the instance_type for launching the
@@ -337,6 +349,12 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
         'template_id': template_id,
     }
+    # Optional network volume mount.
+    if volume_mount_path is not None:
+        params['volume_mount_path'] = volume_mount_path
+    if network_volume_id is not None:
+        params['network_volume_id'] = network_volume_id
     # GPU instance types start with f'{gpu_count}x',
     # CPU instance types start with 'cpu'.
     is_cpu_instance = instance_type.startswith('cpu')

sky/provision/runpod/volume.py ADDED Viewed

@@ -0,0 +1,158 @@
+"""RunPod network volume provisioning."""
+from typing import Any, Dict, List, Optional, Tuple
+from sky import global_user_state
+from sky import models
+from sky import sky_logging
+from sky.adaptors import runpod
+from sky.utils import common_utils
+from sky.utils import volume as volume_lib
+logger = sky_logging.init_logger(__name__)
+def _list_volumes() -> List[Dict[str, Any]]:
+    # GET /v1/networkvolumes returns a list
+    result = runpod.rest_request('GET', '/networkvolumes')
+    if isinstance(result, list):
+        return result
+    # Some deployments may wrap the list.
+    if isinstance(result, dict):
+        for key in ('items', 'data', 'networkVolumes'):
+            if key in result and isinstance(result[key], list):
+                return result[key]
+    return []
+def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Create or resolve a RunPod network volume via REST API.
+    If a volume with the same `name_on_cloud` exists, reuse it. Otherwise,
+    create a new one using POST /v1/networkvolumes.
+    """
+    name_on_cloud = config.name_on_cloud
+    assert name_on_cloud is not None
+    vol_id = _try_resolve_volume_id(name_on_cloud)
+    if vol_id is None:
+        # Create new volume via REST
+        size = config.size
+        if size is None:
+            raise RuntimeError(
+                'RunPod network volume size must be specified to create '
+                'a volume.')
+        try:
+            size_int = int(size)
+            if size_int < volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB:
+                raise RuntimeError(
+                    f'RunPod network volume size must be at least '
+                    f'{volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB}GB.')
+        except Exception as e:  # pylint: disable=broad-except
+            raise RuntimeError(f'Invalid volume size {size!r}: {e}') from e
+        data_center_id = config.zone
+        if not data_center_id:
+            raise RuntimeError(
+                'RunPod DataCenterId is required to create a network '
+                'volume. Set the zone in the infra field.')
+        payload = {
+            'dataCenterId': data_center_id,
+            'name': name_on_cloud,
+            'size': size_int,
+        }
+        resp = runpod.rest_request('POST', '/networkvolumes', json=payload)
+        if isinstance(resp, dict):
+            config.id_on_cloud = resp.get('id')
+        else:
+            raise RuntimeError(
+                f'Failed to create RunPod network volume: {resp}')
+        logger.info(f'Created RunPod network volume {name_on_cloud} '
+                    f'(id={config.id_on_cloud})')
+        return config
+    # Use existing matched volume
+    config.id_on_cloud = vol_id
+    logger.debug(f'Using existing RunPod network volume {name_on_cloud} '
+                 f'(id={config.id_on_cloud})')
+    return config
+def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Deletes a RunPod network volume via REST API if id is known or
+       resolvable. If the volume id is not known, try to resolve it by name.
+    """
+    name_on_cloud = config.name_on_cloud
+    vol_id = config.id_on_cloud
+    if not vol_id:
+        vol_id = _try_resolve_volume_id(name_on_cloud)
+    if not vol_id:
+        logger.warning(
+            f'RunPod network volume id not found for {name_on_cloud}; '
+            f'skip delete')
+        return config
+    runpod.rest_request('DELETE', f'/networkvolumes/{vol_id}')
+    logger.info(f'Deleted RunPod network volume {name_on_cloud} '
+                f'(id={vol_id})')
+    return config
+def _try_resolve_volume_id(name_on_cloud: str) -> Optional[str]:
+    vols = _list_volumes()
+    matched = next((v for v in vols if v.get('name') == name_on_cloud), None)
+    if matched is not None:
+        return matched.get('id')
+    return None
+def get_volume_usedby(
+    config: models.VolumeConfig,) -> Tuple[List[str], List[str]]:
+    """Gets the clusters currently using this RunPod network volume.
+    Returns:
+      (usedby_pods, usedby_clusters)
+    usedby_clusters contains SkyPilot cluster display names inferred from
+      pod names, which may be wrong.
+    """
+    vol_id = config.id_on_cloud
+    name_on_cloud = config.name_on_cloud
+    if vol_id is None:
+        vol_id = _try_resolve_volume_id(name_on_cloud)
+    if vol_id is None:
+        return [], []
+    # Query all pods for current user and filter by networkVolumeId
+    query = """
+    query Pods {
+      myself {
+        pods {
+          id
+          name
+          networkVolumeId
+        }
+      }
+    }
+    """
+    resp = runpod.runpod.api.graphql.run_graphql_query(query)
+    pods = resp.get('data', {}).get('myself', {}).get('pods', [])
+    used_pods = [p for p in pods if p.get('networkVolumeId') == vol_id]
+    usedby_pod_names = [p.get('name') for p in used_pods if p.get('name')]
+    # Map pod names back to SkyPilot cluster names using heuristics.
+    clusters = global_user_state.get_clusters()
+    cluster_names: List[str] = []
+    user_hash = common_utils.get_user_hash()
+    for pod_name in usedby_pod_names:
+        matched = None
+        for c in clusters:
+            display = c.get('name')
+            if not display:
+                continue
+            # Heuristic: RunPod pod name is f"{cluster}-{user_hash}-{xxx}"
+            # This can be wrong.
+            cluster_prefix = display + '-' + user_hash + '-'
+            if pod_name.startswith(cluster_prefix):
+                matched = display
+                break
+        if matched and matched not in cluster_names:
+            cluster_names.append(matched)
+    return usedby_pod_names, cluster_names

sky/server/requests/payloads.py CHANGED Viewed

@@ -309,7 +309,8 @@ class StatusBody(RequestBody):
     cluster_names: Optional[List[str]] = None
     refresh: common_lib.StatusRefreshMode = common_lib.StatusRefreshMode.NONE
     all_users: bool = True
-    include_credentials: bool = False
+    # TODO (kyuds): default to False post 0.10.5
+    include_credentials: bool = True
 class StartBody(RequestBody):
@@ -464,6 +465,11 @@ class VolumeDeleteBody(RequestBody):
     names: List[str]
+class VolumeListBody(RequestBody):
+    """The request body for the volume list endpoint."""
+    pass
 class EndpointsBody(RequestBody):
     """The request body for the endpoint."""
     cluster: str

sky/server/requests/preconditions.py CHANGED Viewed

@@ -162,13 +162,14 @@ class ClusterStartCompletePrecondition(Precondition):
         # We unify these situations into a single state: the process of starting
         # the cluster is done (either normally or abnormally) but cluster is not
         # in UP status.
-        requests = api_requests.get_request_tasks(
-            status=[
-                api_requests.RequestStatus.RUNNING,
-                api_requests.RequestStatus.PENDING
-            ],
-            include_request_names=['sky.launch', 'sky.start'],
-            cluster_names=[self.cluster_name])
+        requests = await api_requests.get_request_tasks_async(
+            req_filter=api_requests.RequestTaskFilter(
+                status=[
+                    api_requests.RequestStatus.RUNNING,
+                    api_requests.RequestStatus.PENDING
+                ],
+                include_request_names=['sky.launch', 'sky.start'],
+                cluster_names=[self.cluster_name]))
         if len(requests) == 0:
             # No running or pending tasks, the start process is done.
             return True, None

sky/server/requests/requests.py CHANGED Viewed

@@ -14,7 +14,7 @@ import threading
 import time
 import traceback
 from typing import (Any, AsyncContextManager, Callable, Dict, Generator, List,
-                    Optional, Tuple)
+                    NamedTuple, Optional, Tuple)
 import colorama
 import filelock
@@ -300,10 +300,11 @@ def kill_cluster_requests(cluster_name: str, exclude_request_name: str):
             prevent killing the caller request.
     """
     request_ids = [
-        request_task.request_id for request_task in get_request_tasks(
+        request_task.request_id
+        for request_task in get_request_tasks(req_filter=RequestTaskFilter(
             cluster_names=[cluster_name],
             status=[RequestStatus.PENDING, RequestStatus.RUNNING],
-            exclude_request_names=[exclude_request_name])
+            exclude_request_names=[exclude_request_name]))
     ]
     kill_requests(request_ids)
@@ -323,11 +324,12 @@ def kill_requests(request_ids: Optional[List[str]] = None,
     """
     if request_ids is None:
         request_ids = [
-            request_task.request_id for request_task in get_request_tasks(
+            request_task.request_id
+            for request_task in get_request_tasks(req_filter=RequestTaskFilter(
                 user_id=user_id,
                 status=[RequestStatus.RUNNING, RequestStatus.PENDING],
                 # Avoid cancelling the cancel request itself.
-                exclude_request_names=['sky.api_cancel'])
+                exclude_request_names=['sky.api_cancel']))
         ]
     cancelled_request_ids = []
     for request_id in request_ids:
@@ -548,6 +550,40 @@ async def get_request_async(request_id: str) -> Optional[Request]:
         return await _get_request_no_lock_async(request_id)
+class StatusWithMsg(NamedTuple):
+    status: RequestStatus
+    status_msg: Optional[str] = None
+@init_db_async
+@metrics_lib.time_me_async
+async def get_request_status_async(
+    request_id: str,
+    include_msg: bool = False,
+) -> Optional[StatusWithMsg]:
+    """Get the status of a request.
+    Args:
+        request_id: The ID of the request.
+        include_msg: Whether to include the status message.
+    Returns:
+        The status of the request. If the request is not found, returns
+        None.
+    """
+    assert _DB is not None
+    columns = 'status'
+    if include_msg:
+        columns += ', status_msg'
+    sql = f'SELECT {columns} FROM {REQUEST_TABLE} WHERE request_id LIKE ?'
+    async with _DB.execute_fetchall_async(sql, (request_id + '%',)) as rows:
+        if rows is None or len(rows) == 0:
+            return None
+        status = RequestStatus(rows[0][0])
+        status_msg = rows[0][1] if include_msg else None
+        return StatusWithMsg(status, status_msg)
 @init_db
 @metrics_lib.time_me
 def create_if_not_exists(request: Request) -> bool:
@@ -570,17 +606,9 @@ async def create_if_not_exists_async(request: Request) -> bool:
         return True
-@init_db
-@metrics_lib.time_me
-def get_request_tasks(
-    status: Optional[List[RequestStatus]] = None,
-    cluster_names: Optional[List[str]] = None,
-    user_id: Optional[str] = None,
-    exclude_request_names: Optional[List[str]] = None,
-    include_request_names: Optional[List[str]] = None,
-    finished_before: Optional[float] = None,
-) -> List[Request]:
-    """Get a list of requests that match the given filters.
+@dataclasses.dataclass
+class RequestTaskFilter:
+    """Filter for requests.
     Args:
         status: a list of statuses of the requests to filter on.
@@ -598,51 +626,87 @@ def get_request_tasks(
         ValueError: If both exclude_request_names and include_request_names are
             provided.
     """
-    if exclude_request_names is not None and include_request_names is not None:
-        raise ValueError(
-            'Only one of exclude_request_names or include_request_names can be '
-            'provided, not both.')
-    filters = []
-    filter_params: List[Any] = []
-    if status is not None:
-        status_list_str = ','.join(repr(status.value) for status in status)
-        filters.append(f'status IN ({status_list_str})')
-    if exclude_request_names is not None:
-        exclude_request_names_str = ','.join(
-            repr(name) for name in exclude_request_names)
-        filters.append(f'name NOT IN ({exclude_request_names_str})')
-    if cluster_names is not None:
-        cluster_names_str = ','.join(repr(name) for name in cluster_names)
-        filters.append(f'{COL_CLUSTER_NAME} IN ({cluster_names_str})')
-    if user_id is not None:
-        filters.append(f'{COL_USER_ID} = ?')
-        filter_params.append(user_id)
-    if include_request_names is not None:
-        request_names_str = ','.join(
-            repr(name) for name in include_request_names)
-        filters.append(f'name IN ({request_names_str})')
-    if finished_before is not None:
-        filters.append('finished_at < ?')
-        filter_params.append(finished_before)
-    assert _DB is not None
-    with _DB.conn:
-        cursor = _DB.conn.cursor()
+    status: Optional[List[RequestStatus]] = None
+    cluster_names: Optional[List[str]] = None
+    user_id: Optional[str] = None
+    exclude_request_names: Optional[List[str]] = None
+    include_request_names: Optional[List[str]] = None
+    finished_before: Optional[float] = None
+    def __post_init__(self):
+        if (self.exclude_request_names is not None and
+                self.include_request_names is not None):
+            raise ValueError(
+                'Only one of exclude_request_names or include_request_names '
+                'can be provided, not both.')
+    def build_query(self) -> Tuple[str, List[Any]]:
+        """Build the SQL query and filter parameters.
+        Returns:
+            A tuple of (SQL, SQL parameters).
+        """
+        filters = []
+        filter_params: List[Any] = []
+        if self.status is not None:
+            status_list_str = ','.join(
+                repr(status.value) for status in self.status)
+            filters.append(f'status IN ({status_list_str})')
+        if self.exclude_request_names is not None:
+            exclude_request_names_str = ','.join(
+                repr(name) for name in self.exclude_request_names)
+            filters.append(f'name NOT IN ({exclude_request_names_str})')
+        if self.cluster_names is not None:
+            cluster_names_str = ','.join(
+                repr(name) for name in self.cluster_names)
+            filters.append(f'{COL_CLUSTER_NAME} IN ({cluster_names_str})')
+        if self.user_id is not None:
+            filters.append(f'{COL_USER_ID} = ?')
+            filter_params.append(self.user_id)
+        if self.include_request_names is not None:
+            request_names_str = ','.join(
+                repr(name) for name in self.include_request_names)
+            filters.append(f'name IN ({request_names_str})')
+        if self.finished_before is not None:
+            filters.append('finished_at < ?')
+            filter_params.append(self.finished_before)
         filter_str = ' AND '.join(filters)
         if filter_str:
             filter_str = f' WHERE {filter_str}'
         columns_str = ', '.join(REQUEST_COLUMNS)
-        cursor.execute(
-            f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
-            'ORDER BY created_at DESC', filter_params)
+        return (f'SELECT {columns_str} FROM {REQUEST_TABLE}{filter_str} '
+                'ORDER BY created_at DESC'), filter_params
+@init_db
+@metrics_lib.time_me
+def get_request_tasks(req_filter: RequestTaskFilter) -> List[Request]:
+    """Get a list of requests that match the given filters.
+    Args:
+        req_filter: the filter to apply to the requests. Refer to
+            RequestTaskFilter for the details.
+    """
+    assert _DB is not None
+    with _DB.conn:
+        cursor = _DB.conn.cursor()
+        cursor.execute(*req_filter.build_query())
         rows = cursor.fetchall()
         if rows is None:
             return []
-    requests = []
-    for row in rows:
-        request = Request.from_row(row)
-        requests.append(request)
-    return requests
+    return [Request.from_row(row) for row in rows]
+@init_db_async
+@metrics_lib.time_me_async
+async def get_request_tasks_async(
+        req_filter: RequestTaskFilter) -> List[Request]:
+    """Async version of get_request_tasks."""
+    assert _DB is not None
+    async with _DB.execute_fetchall_async(*req_filter.build_query()) as rows:
+        if not rows:
+            return []
+    return [Request.from_row(row) for row in rows]
 @init_db_async
@@ -739,8 +803,10 @@ def clean_finished_requests_with_retention(retention_seconds: int):
         retention_seconds: Requests older than this many seconds will be
             deleted.
     """
-    reqs = get_request_tasks(status=RequestStatus.finished_status(),
-                             finished_before=time.time() - retention_seconds)
+    reqs = get_request_tasks(
+        req_filter=RequestTaskFilter(status=RequestStatus.finished_status(),
+                                     finished_before=time.time() -
+                                     retention_seconds))
     subprocess_utils.run_in_parallel(
         func=lambda req: req.log_path.unlink(missing_ok=True),
@@ -767,7 +833,7 @@ async def requests_gc_daemon():
         try:
             # Negative value disables the requests GC
             if retention_seconds >= 0:
-                clean_finished_requests_with_retention(retention_seconds)
+                await clean_finished_requests_with_retention(retention_seconds)
         except asyncio.CancelledError:
             logger.info('Requests GC daemon cancelled')
             break

skypilot-nightly 1.0.0.dev20250902__py3-none-any.whl → 1.0.0.dev20250903__py3-none-any.whl

Potentially problematic release.

skypilot-nightly 1.0.0.dev20250902__py3-none-any.whl → 1.0.0.dev20250903__py3-none-any.whl