skypilot-nightly 1.0.0.dev20250902__py3-none-any.whl → 1.0.0.dev20250904__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/runpod.py +68 -0
- sky/backends/backend_utils.py +5 -3
- sky/backends/cloud_vm_ray_backend.py +7 -2
- sky/client/cli/command.py +38 -6
- sky/client/sdk.py +22 -1
- sky/clouds/kubernetes.py +1 -1
- sky/clouds/nebius.py +4 -2
- sky/clouds/runpod.py +17 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +1 -0
- sky/dashboard/out/_next/static/chunks/{7205-88191679e7988c57.js → 1836-37fede578e2da5f8.js} +4 -9
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +6 -0
- sky/dashboard/out/_next/static/chunks/{3785.d5b86f6ebc88e6e6.js → 3785.4872a2f3aa489880.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4783.c485f48348349f47.js → 5339.3fda4a4010ff4e06.js} +4 -9
- sky/dashboard/out/_next/static/chunks/{9946.3b7b43c217ff70ec.js → 649.b9d7f7d10c1b8c53.js} +4 -9
- sky/dashboard/out/_next/static/chunks/6856-66e696640347e77b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +6 -0
- sky/dashboard/out/_next/static/chunks/9037-1c0101b86582136f.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-39c9bd4cdb7e5a57.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-a0527109c2fab467.js → [cluster]-0b4b35dc1dfe046c.js} +2 -7
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-81351f95f3bec08e.js → [context]-6563820e094f68ca.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-c320641c2bcbbea6.js → infra-aabba60d57826e0f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-de06e613e20bc977.js → [name]-af76bb06dbb3954f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-be35b22e2046564c.js → workspaces-7598c33a746cdc91.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-24c4fc6d30ce0193.js +1 -0
- sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → mriHUOVL_Ht-CeW-e7saa}/_buildManifest.js +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +29 -38
- sky/global_user_state.py +16 -1
- sky/jobs/state.py +1 -1
- sky/models.py +1 -0
- sky/provision/kubernetes/instance.py +10 -3
- sky/provision/runpod/__init__.py +3 -0
- sky/provision/runpod/instance.py +17 -0
- sky/provision/runpod/utils.py +23 -5
- sky/provision/runpod/volume.py +158 -0
- sky/serve/serve_state.py +1 -1
- sky/server/config.py +31 -3
- sky/server/requests/executor.py +9 -3
- sky/server/requests/payloads.py +7 -1
- sky/server/requests/preconditions.py +8 -7
- sky/server/requests/requests.py +132 -57
- sky/server/server.py +48 -38
- sky/server/stream_utils.py +14 -6
- sky/server/uvicorn.py +11 -4
- sky/skylet/constants.py +1 -1
- sky/skypilot_config.py +21 -9
- sky/ssh_node_pools/server.py +5 -5
- sky/templates/kubernetes-ray.yml.j2 +5 -5
- sky/templates/runpod-ray.yml.j2 +8 -0
- sky/users/server.py +18 -15
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/command_runner.py +4 -0
- sky/utils/db/db_utils.py +58 -1
- sky/utils/db/migration_utils.py +0 -16
- sky/utils/resource_checker.py +6 -5
- sky/utils/schemas.py +1 -1
- sky/utils/volume.py +3 -0
- sky/volumes/client/sdk.py +28 -0
- sky/volumes/server/server.py +11 -1
- sky/volumes/utils.py +117 -68
- sky/volumes/volume.py +98 -39
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/METADATA +34 -34
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/RECORD +86 -84
- sky/dashboard/out/_next/static/chunks/1121-8afcf719ea87debc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3015-8089ed1e0b7e37fd.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-049014c6d43d127b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.a1bef12d672bb66d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-89a84fd7fa31362d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9984.7eb6cc51fb460cae.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-06afb50d25f7c61f.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-7421e63ac35f8fce.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-0eaa6f7e63f51311.js +0 -1
- /sky/dashboard/out/_next/static/{tio0QibqY2C0F2-rPy00p → mriHUOVL_Ht-CeW-e7saa}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250902.dist-info → skypilot_nightly-1.0.0.dev20250904.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
@@ -308,7 +308,7 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE
     # get an engine to the db
-    engine =
+    engine = db_utils.get_engine('state')
 
     # run migrations if needed
     create_table(engine)
@@ -2312,3 +2312,18 @@ def set_system_config(config_key: str, config_value: str) -> None:
         })
         session.execute(upsert_stmnt)
         session.commit()
+
+
+@_init_db
+def get_max_db_connections() -> Optional[int]:
+    """Get the maximum number of connections for the engine."""
+    assert _SQLALCHEMY_ENGINE is not None
+    if (_SQLALCHEMY_ENGINE.dialect.name ==
+            db_utils.SQLAlchemyDialect.SQLITE.value):
+        return None
+    with sqlalchemy.orm.Session(_SQLALCHEMY_ENGINE) as session:
+        max_connections = session.execute(
+            sqlalchemy.text('SHOW max_connections')).scalar()
+        if max_connections is None:
+            return None
+        return int(max_connections)
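Note: the new `get_max_db_connections` helper reads PostgreSQL's `max_connections` server setting and returns None on SQLite, which has no server-side connection limit. A minimal standalone sketch of the same query, assuming a plain SQLAlchemy engine (the real helper additionally relies on the module-level engine and `@_init_db` setup):

import sqlalchemy

def max_db_connections(engine: sqlalchemy.engine.Engine):
    # SQLite has no server-side connection limit.
    if engine.dialect.name == 'sqlite':
        return None
    with engine.connect() as conn:
        # 'SHOW max_connections' is a PostgreSQL command.
        value = conn.execute(sqlalchemy.text('SHOW max_connections')).scalar()
    return int(value) if value is not None else None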
sky/jobs/state.py
CHANGED
@@ -157,7 +157,7 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE
     # get an engine to the db
-    engine =
+    engine = db_utils.get_engine('spot_jobs')
 
     # run migrations if needed
     create_table(engine)
sky/provision/kubernetes/instance.py
CHANGED
@@ -1047,8 +1047,10 @@ def stop_instances(
     raise NotImplementedError()
 
 
-def _delete_services(name_prefix: str, namespace: str,
-                     context: Optional[str]) -> None:
+def _delete_services(name_prefix: str,
+                     namespace: str,
+                     context: Optional[str],
+                     skip_ssh_service: bool = False) -> None:
     """Delete services with the given name prefix.
 
     Args:
@@ -1057,7 +1059,9 @@ def _delete_services(name_prefix: str, namespace: str,
         context: Kubernetes context
     """
     # TODO(andy): We should use tag for the service filter.
-    for service_name in [name_prefix, f'{name_prefix}-ssh']:
+    services = ([name_prefix, f'{name_prefix}-ssh']
+                if not skip_ssh_service else [name_prefix])
+    for service_name in services:
         # Since we are not saving this lambda, it's a false positive.
         # TODO(andyl): Wait for
         # https://github.com/pylint-dev/pylint/issues/5263.
@@ -1083,6 +1087,9 @@ def _terminate_node(namespace: str,
         # Delete services for the head pod
         # services are specified in sky/templates/kubernetes-ray.yml.j2
         _delete_services(pod_name, namespace, context)
+    else:
+        # No ssh service is created for worker pods
+        _delete_services(pod_name, namespace, context, skip_ssh_service=True)
 
     # Note - delete pod after all other resources are deleted.
     # This is to ensure there are no leftover resources if this down is run
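Note: the change above skips deleting the `-ssh` service for worker pods, which never get one. A hedged sketch of the head/worker cleanup split using the raw kubernetes client (the real code goes through SkyPilot's kubernetes adaptor and retry helpers, so the function and its arguments here are illustrative only):

from kubernetes import client, config

def delete_pod_services(pod_name: str, namespace: str, is_head: bool) -> None:
    config.load_kube_config()
    v1 = client.CoreV1Api()
    # Head pods have a main service plus an SSH service; workers only
    # have the main one, so the '-ssh' deletion is skipped for them.
    names = [pod_name, f'{pod_name}-ssh'] if is_head else [pod_name]
    for name in names:
        try:
            v1.delete_namespaced_service(name=name, namespace=namespace)
        except client.exceptions.ApiException as e:
            if e.status != 404:  # Already gone is fine.
                raise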
sky/provision/runpod/__init__.py
CHANGED
@@ -9,3 +9,6 @@ from sky.provision.runpod.instance import run_instances
 from sky.provision.runpod.instance import stop_instances
 from sky.provision.runpod.instance import terminate_instances
 from sky.provision.runpod.instance import wait_instances
+from sky.provision.runpod.volume import apply_volume
+from sky.provision.runpod.volume import delete_volume
+from sky.provision.runpod.volume import get_volume_usedby
sky/provision/runpod/instance.py
CHANGED
@@ -80,6 +80,21 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                               created_instance_ids=[])
 
     created_instance_ids = []
+    volume_mounts = config.node_config.get('VolumeMounts', [])
+    network_volume_id = None
+    volume_mount_path = None
+    if volume_mounts:
+        if len(volume_mounts) > 1:
+            logger.warning(
+                f'RunPod only supports one network volume mount, '
+                f'but {len(volume_mounts)} are specified. Only the first one '
+                f'will be used.')
+        volume_mount = volume_mounts[0]
+        network_volume_id = volume_mount.get('VolumeIdOnCloud')
+        volume_mount_path = volume_mount.get('MountPath')
+        if network_volume_id is None or volume_mount_path is None:
+            raise RuntimeError(
+                'Network volume ID and mount path must be specified.')
     for _ in range(to_start_count):
         node_type = 'head' if head_instance_id is None else 'worker'
         try:
@@ -97,6 +112,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                 bid_per_gpu=config.node_config['BidPerGPU'],
                 docker_login_config=config.provider_config.get(
                     'docker_login_config'),
+                network_volume_id=network_volume_id,
+                volume_mount_path=volume_mount_path,
             )
         except Exception as e:  # pylint: disable=broad-except
             logger.warning(f'run_instances error: {e}')
sky/provision/runpod/utils.py
CHANGED
@@ -263,11 +263,23 @@ def _create_template_for_docker_login(
     return login_config.format_image(image_name), create_template_resp['id']
 
 
-def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
-           zone: str, disk_size: int, image_name: str,
-           ports: Optional[List[int]], public_key: str,
-           preemptible: Optional[bool], bid_per_gpu: float,
-           docker_login_config: Optional[Dict[str, str]]) -> str:
+def launch(
+    cluster_name: str,
+    node_type: str,
+    instance_type: str,
+    region: str,
+    zone: str,
+    disk_size: int,
+    image_name: str,
+    ports: Optional[List[int]],
+    public_key: str,
+    preemptible: Optional[bool],
+    bid_per_gpu: float,
+    docker_login_config: Optional[Dict[str, str]],
+    *,
+    network_volume_id: Optional[str] = None,
+    volume_mount_path: Optional[str] = None,
+) -> str:
     """Launches an instance with the given parameters.
 
     For CPU instances, we directly use the instance_type for launching the
@@ -337,6 +349,12 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
         'template_id': template_id,
     }
 
+    # Optional network volume mount.
+    if volume_mount_path is not None:
+        params['volume_mount_path'] = volume_mount_path
+    if network_volume_id is not None:
+        params['network_volume_id'] = network_volume_id
+
     # GPU instance types start with f'{gpu_count}x',
     # CPU instance types start with 'cpu'.
     is_cpu_instance = instance_type.startswith('cpu')
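Note: the two new parameters are keyword-only, so existing call sites keep working and a volume is attached only when both values are passed through. A hedged sketch of the call shape (all argument values below are placeholders, not defaults):

instance_id = launch(
    cluster_name='my-cluster',
    node_type='head',
    instance_type='1x_NVIDIA_A100',  # placeholder RunPod instance type
    region='EU-RO-1',
    zone='EU-RO-1',
    disk_size=100,
    image_name='runpod/base:0.0.2',  # placeholder image
    ports=[22],
    public_key='ssh-ed25519 AAAA...',
    preemptible=False,
    bid_per_gpu=0.0,
    docker_login_config=None,
    network_volume_id='vol-abc123',  # e.g. from apply_volume()
    volume_mount_path='/workspace',
)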
sky/provision/runpod/volume.py
ADDED
@@ -0,0 +1,158 @@
+"""RunPod network volume provisioning."""
+from typing import Any, Dict, List, Optional, Tuple
+
+from sky import global_user_state
+from sky import models
+from sky import sky_logging
+from sky.adaptors import runpod
+from sky.utils import common_utils
+from sky.utils import volume as volume_lib
+
+logger = sky_logging.init_logger(__name__)
+
+
+def _list_volumes() -> List[Dict[str, Any]]:
+    # GET /v1/networkvolumes returns a list
+    result = runpod.rest_request('GET', '/networkvolumes')
+    if isinstance(result, list):
+        return result
+    # Some deployments may wrap the list.
+    if isinstance(result, dict):
+        for key in ('items', 'data', 'networkVolumes'):
+            if key in result and isinstance(result[key], list):
+                return result[key]
+    return []
+
+
+def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Create or resolve a RunPod network volume via REST API.
+
+    If a volume with the same `name_on_cloud` exists, reuse it. Otherwise,
+    create a new one using POST /v1/networkvolumes.
+    """
+    name_on_cloud = config.name_on_cloud
+    assert name_on_cloud is not None
+
+    vol_id = _try_resolve_volume_id(name_on_cloud)
+    if vol_id is None:
+        # Create new volume via REST
+        size = config.size
+        if size is None:
+            raise RuntimeError(
+                'RunPod network volume size must be specified to create '
+                'a volume.')
+        try:
+            size_int = int(size)
+            if size_int < volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB:
+                raise RuntimeError(
+                    f'RunPod network volume size must be at least '
+                    f'{volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB}GB.')
+        except Exception as e:  # pylint: disable=broad-except
+            raise RuntimeError(f'Invalid volume size {size!r}: {e}') from e
+        data_center_id = config.zone
+        if not data_center_id:
+            raise RuntimeError(
+                'RunPod DataCenterId is required to create a network '
+                'volume. Set the zone in the infra field.')
+        payload = {
+            'dataCenterId': data_center_id,
+            'name': name_on_cloud,
+            'size': size_int,
+        }
+        resp = runpod.rest_request('POST', '/networkvolumes', json=payload)
+        if isinstance(resp, dict):
+            config.id_on_cloud = resp.get('id')
+        else:
+            raise RuntimeError(
+                f'Failed to create RunPod network volume: {resp}')
+        logger.info(f'Created RunPod network volume {name_on_cloud} '
+                    f'(id={config.id_on_cloud})')
+        return config
+
+    # Use existing matched volume
+    config.id_on_cloud = vol_id
+    logger.debug(f'Using existing RunPod network volume {name_on_cloud} '
+                 f'(id={config.id_on_cloud})')
+    return config
+
+
+def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
+    """Deletes a RunPod network volume via REST API if id is known or
+    resolvable. If the volume id is not known, try to resolve it by name.
+    """
+    name_on_cloud = config.name_on_cloud
+    vol_id = config.id_on_cloud
+    if not vol_id:
+        vol_id = _try_resolve_volume_id(name_on_cloud)
+    if not vol_id:
+        logger.warning(
+            f'RunPod network volume id not found for {name_on_cloud}; '
+            f'skip delete')
+        return config
+    runpod.rest_request('DELETE', f'/networkvolumes/{vol_id}')
+    logger.info(f'Deleted RunPod network volume {name_on_cloud} '
+                f'(id={vol_id})')
+    return config
+
+
+def _try_resolve_volume_id(name_on_cloud: str) -> Optional[str]:
+    vols = _list_volumes()
+    matched = next((v for v in vols if v.get('name') == name_on_cloud), None)
+    if matched is not None:
+        return matched.get('id')
+    return None
+
+
+def get_volume_usedby(
+        config: models.VolumeConfig,) -> Tuple[List[str], List[str]]:
+    """Gets the clusters currently using this RunPod network volume.
+
+    Returns:
+        (usedby_pods, usedby_clusters)
+        usedby_clusters contains SkyPilot cluster display names inferred from
+        pod names, which may be wrong.
+    """
+    vol_id = config.id_on_cloud
+    name_on_cloud = config.name_on_cloud
+    if vol_id is None:
+        vol_id = _try_resolve_volume_id(name_on_cloud)
+    if vol_id is None:
+        return [], []
+
+    # Query all pods for current user and filter by networkVolumeId
+    query = """
+    query Pods {
+        myself {
+            pods {
+                id
+                name
+                networkVolumeId
+            }
+        }
+    }
+    """
+    resp = runpod.runpod.api.graphql.run_graphql_query(query)
+    pods = resp.get('data', {}).get('myself', {}).get('pods', [])
+    used_pods = [p for p in pods if p.get('networkVolumeId') == vol_id]
+    usedby_pod_names = [p.get('name') for p in used_pods if p.get('name')]
+
+    # Map pod names back to SkyPilot cluster names using heuristics.
+    clusters = global_user_state.get_clusters()
+    cluster_names: List[str] = []
+    user_hash = common_utils.get_user_hash()
+    for pod_name in usedby_pod_names:
+        matched = None
+        for c in clusters:
+            display = c.get('name')
+            if not display:
+                continue
+            # Heuristic: RunPod pod name is f"{cluster}-{user_hash}-{xxx}"
+            # This can be wrong.
+            cluster_prefix = display + '-' + user_hash + '-'
+            if pod_name.startswith(cluster_prefix):
+                matched = display
+                break
+        if matched and matched not in cluster_names:
+            cluster_names.append(matched)
+
+    return usedby_pod_names, cluster_names
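Note: together, `apply_volume`, `get_volume_usedby`, and `delete_volume` give volumes an idempotent lifecycle keyed on `name_on_cloud`. A hedged usage sketch (the `VolumeConfig` fields shown are the ones the module reads; the exact constructor signature is an assumption):

from sky import models
from sky.provision.runpod import volume as runpod_volume

# Hypothetical construction; only these fields are read by the module.
config = models.VolumeConfig(
    name_on_cloud='my-vol-abc123',
    size='500',      # GB; must meet the RunPod minimum
    zone='EU-RO-1',  # RunPod DataCenterId
)
config = runpod_volume.apply_volume(config)  # create, or reuse by name
print(config.id_on_cloud)
pods, clusters = runpod_volume.get_volume_usedby(config)
if not pods:
    runpod_volume.delete_volume(config)  # safe: no pod is using it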
sky/serve/serve_state.py
CHANGED
@@ -130,7 +130,7 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
     if _SQLALCHEMY_ENGINE is not None:
         return _SQLALCHEMY_ENGINE
     # get an engine to the db
-    engine =
+    engine = db_utils.get_engine('serve/services')
 
     # run migrations if needed
     create_table(engine)
sky/server/config.py
CHANGED
@@ -2,6 +2,7 @@
 
 import dataclasses
 import enum
+from typing import Optional
 
 from sky import sky_logging
 from sky.server import constants as server_constants
@@ -61,6 +62,7 @@ class QueueBackend(enum.Enum):
 class WorkerConfig:
     garanteed_parallelism: int
     burstable_parallelism: int
+    num_db_connections_per_worker: int
 
 
 @dataclasses.dataclass
@@ -68,10 +70,13 @@ class ServerConfig:
     num_server_workers: int
     long_worker_config: WorkerConfig
     short_worker_config: WorkerConfig
+    num_db_connections_per_worker: int
    queue_backend: QueueBackend
 
 
-def compute_server_config(deploy: bool) -> ServerConfig:
+def compute_server_config(deploy: bool,
+                          max_db_connections: Optional[int] = None
+                         ) -> ServerConfig:
     """Compute the server config based on environment.
 
     We have different assumptions for the resources in different deployment
@@ -114,7 +119,17 @@ def compute_server_config(deploy: bool) -> ServerConfig:
     queue_backend = QueueBackend.MULTIPROCESSING
     burstable_parallel_for_long = 0
     burstable_parallel_for_short = 0
+    # if num_db_connections_per_worker is 0, server will use NullPool
+    # to conserve the number of concurrent db connections.
+    # This could lead to performance degradation.
+    num_db_connections_per_worker = 0
     num_server_workers = cpu_count
+
+    # +1 for the event loop running the main process
+    # and gc daemons in the '__main__' body of sky/server/server.py
+    max_parallel_all_workers = (max_parallel_for_long + max_parallel_for_short +
+                                num_server_workers + 1)
+
     if not deploy:
         # For local mode, use local queue backend since we only run 1 uvicorn
         # worker in local mode and no multiprocessing is needed.
@@ -140,6 +155,16 @@ def compute_server_config(deploy: bool) -> ServerConfig:
             'SkyPilot API server will run in low resource mode because '
             'the available memory is less than '
             f'{server_constants.MIN_AVAIL_MEM_GB}GB.')
+    elif max_db_connections is not None:
+        if max_parallel_all_workers > max_db_connections:
+            logger.warning(
+                f'Max parallel all workers ({max_parallel_all_workers}) '
+                f'is greater than max db connections ({max_db_connections}). '
+                'Increase the number of max db connections to '
+                f'at least {max_parallel_all_workers} for optimal performance.')
+        else:
+            num_db_connections_per_worker = 1
+
     logger.info(
         f'SkyPilot API server will start {num_server_workers} server processes '
         f'with {max_parallel_for_long} background workers for long requests '
@@ -150,10 +175,13 @@ def compute_server_config(deploy: bool) -> ServerConfig:
         queue_backend=queue_backend,
         long_worker_config=WorkerConfig(
             garanteed_parallelism=max_parallel_for_long,
-            burstable_parallelism=burstable_parallel_for_long),
+            burstable_parallelism=burstable_parallel_for_long,
+            num_db_connections_per_worker=num_db_connections_per_worker),
         short_worker_config=WorkerConfig(
             garanteed_parallelism=max_parallel_for_short,
-            burstable_parallelism=burstable_parallel_for_short),
+            burstable_parallelism=burstable_parallel_for_short,
+            num_db_connections_per_worker=num_db_connections_per_worker),
+        num_db_connections_per_worker=num_db_connections_per_worker,
     )
 
 
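Note: the `num_db_connections_per_worker = 0` default refers to SQLAlchemy's NullPool, which opens and closes a connection per checkout instead of pinning one per worker; it conserves server-side slots at a latency cost. A minimal sketch of the trade-off, assuming a PostgreSQL URL:

import sqlalchemy
from sqlalchemy.pool import NullPool, QueuePool

url = 'postgresql://user:pass@localhost/skypilot'  # placeholder URL

# NullPool: no pinned connections; each checkout dials the server anew,
# conserving Postgres' max_connections budget.
engine_null = sqlalchemy.create_engine(url, poolclass=NullPool)

# pool_size=1: one pinned connection per worker process; faster per
# query, but every worker permanently consumes a server slot.
engine_pooled = sqlalchemy.create_engine(url, poolclass=QueuePool, pool_size=1)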
sky/server/requests/executor.py
CHANGED
@@ -57,6 +57,7 @@ from sky.utils import subprocess_utils
 from sky.utils import tempstore
 from sky.utils import timeline
 from sky.utils import yaml_utils
+from sky.utils.db import db_utils
 from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
@@ -152,6 +153,8 @@ class RequestWorker:
         self.schedule_type = schedule_type
         self.garanteed_parallelism = config.garanteed_parallelism
         self.burstable_parallelism = config.burstable_parallelism
+        self.num_db_connections_per_worker = (
+            config.num_db_connections_per_worker)
         self._thread: Optional[threading.Thread] = None
         self._cancel_event = threading.Event()
 
@@ -190,8 +193,9 @@ class RequestWorker:
             # multiple requests can share the same process pid, which may cause
             # issues with SkyPilot core functions if they rely on the exit of
             # the process, such as subprocess_daemon.py.
-            fut = executor.submit_until_success(
-                _request_execution_wrapper, request_id, ignore_return_value)
+            fut = executor.submit_until_success(
+                _request_execution_wrapper, request_id, ignore_return_value,
+                self.num_db_connections_per_worker)
             # Monitor the result of the request execution.
             threading.Thread(target=self.handle_task_result,
                              args=(fut, request_element),
@@ -351,7 +355,8 @@ def _sigterm_handler(signum: int, frame: Optional['types.FrameType']) -> None:
 
 
 def _request_execution_wrapper(request_id: str,
-                               ignore_return_value: bool) -> None:
+                               ignore_return_value: bool,
+                               num_db_connections_per_worker: int = 0) -> None:
     """Wrapper for a request execution.
 
     It wraps the execution of a request to:
@@ -362,6 +367,7 @@ def _request_execution_wrapper(request_id: str,
     4. Handle the SIGTERM signal to abort the request gracefully.
     5. Maintain the lifecycle of the temp dir used by the request.
     """
+    db_utils.set_max_connections(num_db_connections_per_worker)
     # Handle the SIGTERM signal to abort the request processing gracefully.
     signal.signal(signal.SIGTERM, _sigterm_handler)
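Note: `db_utils.set_max_connections` runs before any request work so the per-process connection budget is fixed before engines are created. The body of `sky.utils.db.db_utils` is not shown in this diff, so the sketch below is an assumption about its shape, not the actual implementation:

import sqlalchemy
from sqlalchemy.pool import NullPool

_max_connections = 0  # 0 => NullPool, i.e. no pinned connections

def set_max_connections(num_connections: int) -> None:
    # Record the budget before any engine for this process is built.
    global _max_connections
    _max_connections = num_connections

def get_engine(db_name: str) -> sqlalchemy.engine.Engine:
    url = f'postgresql://localhost/{db_name}'  # hypothetical URL resolution
    if _max_connections == 0:
        return sqlalchemy.create_engine(url, poolclass=NullPool)
    return sqlalchemy.create_engine(url, pool_size=_max_connections)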
sky/server/requests/payloads.py
CHANGED
@@ -309,7 +309,8 @@ class StatusBody(RequestBody):
     cluster_names: Optional[List[str]] = None
     refresh: common_lib.StatusRefreshMode = common_lib.StatusRefreshMode.NONE
     all_users: bool = True
-
+    # TODO (kyuds): default to False post 0.10.5
+    include_credentials: bool = True
 
 
 class StartBody(RequestBody):
@@ -464,6 +465,11 @@ class VolumeDeleteBody(RequestBody):
     names: List[str]
 
 
+class VolumeListBody(RequestBody):
+    """The request body for the volume list endpoint."""
+    pass
+
+
 class EndpointsBody(RequestBody):
     """The request body for the endpoint."""
     cluster: str
sky/server/requests/preconditions.py
CHANGED
@@ -162,13 +162,14 @@ class ClusterStartCompletePrecondition(Precondition):
         # We unify these situations into a single state: the process of starting
         # the cluster is done (either normally or abnormally) but cluster is not
         # in UP status.
-        requests = api_requests.get_request_tasks(
-            status=[
-                api_requests.RequestStatus.RUNNING,
-                api_requests.RequestStatus.PENDING
-            ],
-            include_request_names=['sky.launch', 'sky.start'],
-            cluster_names=[self.cluster_name])
+        requests = await api_requests.get_request_tasks_async(
+            req_filter=api_requests.RequestTaskFilter(
+                status=[
+                    api_requests.RequestStatus.RUNNING,
+                    api_requests.RequestStatus.PENDING
+                ],
+                include_request_names=['sky.launch', 'sky.start'],
+                cluster_names=[self.cluster_name]))
         if len(requests) == 0:
             # No running or pending tasks, the start process is done.
             return True, None