PyPI - dstack - Versions diffs - 0.19.34__py3-none-any.whl → 0.19.35__py3-none-any.whl - Mend

dstack 0.19.34__py3-none-any.whl → 0.19.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dstack might be problematic. Click here for more details.

Files changed (41)

dstack/_internal/cli/services/configurators/run.py +1 -1
dstack/_internal/core/backends/base/compute.py +20 -1
dstack/_internal/core/backends/base/models.py +10 -0
dstack/_internal/core/backends/base/offers.py +1 -0
dstack/_internal/core/backends/features.py +5 -0
dstack/_internal/core/backends/nebius/compute.py +28 -16
dstack/_internal/core/backends/nebius/configurator.py +1 -1
dstack/_internal/core/backends/nebius/models.py +4 -0
dstack/_internal/core/backends/nebius/resources.py +41 -20
dstack/_internal/core/backends/runpod/api_client.py +245 -59
dstack/_internal/core/backends/runpod/compute.py +157 -13
dstack/_internal/core/models/compute_groups.py +39 -0
dstack/_internal/core/models/fleets.py +6 -1
dstack/_internal/core/models/profiles.py +3 -1
dstack/_internal/core/models/runs.py +3 -0
dstack/_internal/server/app.py +14 -2
dstack/_internal/server/background/__init__.py +7 -0
dstack/_internal/server/background/tasks/process_compute_groups.py +164 -0
dstack/_internal/server/background/tasks/process_instances.py +81 -49
dstack/_internal/server/background/tasks/process_submitted_jobs.py +179 -84
dstack/_internal/server/migrations/env.py +20 -2
dstack/_internal/server/migrations/versions/7d1ec2b920ac_add_computegroupmodel.py +93 -0
dstack/_internal/server/models.py +39 -0
dstack/_internal/server/routers/runs.py +15 -6
dstack/_internal/server/services/compute_groups.py +22 -0
dstack/_internal/server/services/fleets.py +1 -0
dstack/_internal/server/services/jobs/__init__.py +13 -0
dstack/_internal/server/services/jobs/configurators/base.py +3 -2
dstack/_internal/server/services/requirements/combine.py +1 -0
dstack/_internal/server/services/runs.py +17 -3
dstack/_internal/server/testing/common.py +51 -0
dstack/_internal/server/utils/routers.py +18 -20
dstack/_internal/settings.py +4 -1
dstack/_internal/utils/version.py +22 -0
dstack/version.py +1 -1
{dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/METADATA +3 -3
{dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/RECORD +40 -36
dstack/_internal/core/backends/nebius/fabrics.py +0 -49
{dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/WHEEL +0 -0
{dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/entry_points.txt +0 -0
{dstack-0.19.34.dist-info → dstack-0.19.35.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/core/backends/runpod/api_client.py CHANGED Viewed

@@ -11,6 +11,14 @@ from dstack._internal.utils.common import get_current_datetime
 API_URL = "https://api.runpod.io/graphql"
+class RunpodApiClientError(BackendError):
+    errors: List[Dict]
+    def __init__(self, errors: List[Dict]):
+        self.errors = errors
+        super().__init__(errors)
 class RunpodApiClient:
     def __init__(self, api_key: str):
         self.api_key = api_key
@@ -23,7 +31,19 @@ class RunpodApiClient:
         return True
     def get_user_details(self) -> Dict:
-        resp = self._make_request({"query": user_details_query, "variable": {}})
+        resp = self._make_request(
+            {
+                "query": """
+                query myself {
+                    myself {
+                        id
+                        authId
+                        email
+                    }
+                }
+                """
+            }
+        )
         return resp.json()
     def create_pod(
@@ -52,28 +72,28 @@ class RunpodApiClient:
     ) -> Dict:
         resp = self._make_request(
             {
-                "query": generate_pod_deployment_mutation(
-                    name,
-                    image_name,
-                    gpu_type_id,
-                    cloud_type,
-                    support_public_ip,
-                    start_ssh,
-                    data_center_id,
-                    country_code,
-                    gpu_count,
-                    volume_in_gb,
-                    container_disk_in_gb,
-                    min_vcpu_count,
-                    min_memory_in_gb,
-                    docker_args,
-                    ports,
-                    volume_mount_path,
-                    env,
-                    template_id,
-                    network_volume_id,
-                    allowed_cuda_versions,
-                    bid_per_gpu,
+                "query": _generate_pod_deployment_mutation(
+                    name=name,
+                    image_name=image_name,
+                    gpu_type_id=gpu_type_id,
+                    cloud_type=cloud_type,
+                    support_public_ip=support_public_ip,
+                    start_ssh=start_ssh,
+                    data_center_id=data_center_id,
+                    country_code=country_code,
+                    gpu_count=gpu_count,
+                    volume_in_gb=volume_in_gb,
+                    container_disk_in_gb=container_disk_in_gb,
+                    min_vcpu_count=min_vcpu_count,
+                    min_memory_in_gb=min_memory_in_gb,
+                    docker_args=docker_args,
+                    ports=ports,
+                    volume_mount_path=volume_mount_path,
+                    env=env,
+                    template_id=template_id,
+                    network_volume_id=network_volume_id,
+                    allowed_cuda_versions=allowed_cuda_versions,
+                    bid_per_gpu=bid_per_gpu,
                 )
             }
         )
@@ -86,7 +106,9 @@ class RunpodApiClient:
         image_name: str,
         container_disk_in_gb: int,
         container_registry_auth_id: str,
-        volume_in_gb: int = 0,
+        # Default pod volume is 20GB.
+        # RunPod errors if it's not specified for podEditJob.
+        volume_in_gb: int = 20,
     ) -> str:
         resp = self._make_request(
             {
@@ -108,12 +130,12 @@ class RunpodApiClient:
         return resp.json()["data"]["podEditJob"]["id"]
     def get_pod(self, pod_id: str) -> Dict:
-        resp = self._make_request({"query": generate_pod_query(pod_id)})
+        resp = self._make_request({"query": _generate_pod_query(pod_id)})
         data = resp.json()
         return data["data"]["pod"]
     def terminate_pod(self, pod_id: str) -> Dict:
-        resp = self._make_request({"query": generate_pod_terminate_mutation(pod_id)})
+        resp = self._make_request({"query": _generate_pod_terminate_mutation(pod_id)})
         data = resp.json()
         return data["data"]
@@ -213,7 +235,7 @@ class RunpodApiClient:
         )
         return response.json()["data"]["createNetworkVolume"]["id"]
-    def delete_network_volume(self, volume_id: str):
+    def delete_network_volume(self, volume_id: str) -> None:
         self._make_request(
             {
                 "query": f"""
@@ -228,7 +250,66 @@ class RunpodApiClient:
             }
         )
-    def _make_request(self, data: Any = None) -> Response:
+    def create_cluster(
+        self,
+        cluster_name: str,
+        gpu_type_id: str,
+        pod_count: int,
+        gpu_count_per_pod: int,
+        image_name: str,
+        deploy_cost: str,
+        template_id: Optional[str] = None,
+        cluster_type: str = "TRAINING",
+        network_volume_id: Optional[str] = None,
+        volume_in_gb: Optional[int] = None,
+        throughput: Optional[int] = None,
+        allowed_cuda_versions: Optional[List[str]] = None,
+        volume_key: Optional[str] = None,
+        data_center_id: Optional[str] = None,
+        start_jupyter: bool = False,
+        start_ssh: bool = False,
+        container_disk_in_gb: Optional[int] = None,
+        docker_args: Optional[str] = None,
+        env: Optional[Dict[str, Any]] = None,
+        volume_mount_path: Optional[str] = None,
+        ports: Optional[str] = None,
+    ) -> Dict:
+        resp = self._make_request(
+            {
+                "query": _generate_create_cluster_mutation(
+                    cluster_name=cluster_name,
+                    gpu_type_id=gpu_type_id,
+                    pod_count=pod_count,
+                    gpu_count_per_pod=gpu_count_per_pod,
+                    image_name=image_name,
+                    cluster_type=cluster_type,
+                    deploy_cost=deploy_cost,
+                    template_id=template_id,
+                    network_volume_id=network_volume_id,
+                    volume_in_gb=volume_in_gb,
+                    throughput=throughput,
+                    allowed_cuda_versions=allowed_cuda_versions,
+                    volume_key=volume_key,
+                    data_center_id=data_center_id,
+                    start_jupyter=start_jupyter,
+                    start_ssh=start_ssh,
+                    container_disk_in_gb=container_disk_in_gb,
+                    docker_args=docker_args,
+                    env=env,
+                    volume_mount_path=volume_mount_path,
+                    ports=ports,
+                )
+            }
+        )
+        data = resp.json()["data"]
+        return data["createCluster"]
+    def delete_cluster(self, cluster_id: str) -> bool:
+        resp = self._make_request({"query": _generate_delete_cluster_mutation(cluster_id)})
+        data = resp.json()["data"]
+        return data["deleteCluster"]
+    def _make_request(self, data: Optional[Dict[str, Any]] = None) -> Response:
         try:
             response = requests.request(
                 method="POST",
@@ -237,10 +318,10 @@ class RunpodApiClient:
                 timeout=120,
             )
             response.raise_for_status()
-            if "errors" in response.json():
-                if "podTerminate" in response.json()["errors"][0]["path"]:
-                    raise BackendError("Instance Not Found")
-                raise BackendError(response.json()["errors"][0]["message"])
+            response_json = response.json()
+            # RunPod returns 200 on client errors
+            if "errors" in response_json:
+                raise RunpodApiClientError(errors=response_json["errors"])
             return response
         except requests.HTTPError as e:
             if e.response is not None and e.response.status_code in (
@@ -250,7 +331,7 @@ class RunpodApiClient:
                 raise BackendInvalidCredentialsError(e.response.text)
             raise
-    def wait_for_instance(self, instance_id) -> Optional[Dict]:
+    def wait_for_instance(self, instance_id: str) -> Optional[Dict]:
         start = get_current_datetime()
         wait_for_instance_interval = 5
         # To change the status to "running," the image must be pulled and then started.
@@ -263,18 +344,7 @@ class RunpodApiClient:
         return
-user_details_query = """
-query myself {
-    myself {
-        id
-        authId
-        email
-    }
-}
-"""
-def generate_pod_query(pod_id: str) -> str:
+def _generate_pod_query(pod_id: str) -> str:
     """
     Generate a query for a specific GPU type
     """
@@ -283,6 +353,7 @@ def generate_pod_query(pod_id: str) -> str:
     query pod {{
         pod(input: {{podId: "{pod_id}"}}) {{
             id
+            clusterIp
             containerDiskInGb
             costPerHr
             desiredStatus
@@ -319,26 +390,26 @@ def generate_pod_query(pod_id: str) -> str:
     """
-def generate_pod_deployment_mutation(
+def _generate_pod_deployment_mutation(
     name: str,
     image_name: str,
     gpu_type_id: str,
     cloud_type: str,
     support_public_ip: bool = True,
     start_ssh: bool = True,
-    data_center_id=None,
-    country_code=None,
-    gpu_count=None,
-    volume_in_gb=None,
-    container_disk_in_gb=None,
-    min_vcpu_count=None,
-    min_memory_in_gb=None,
-    docker_args=None,
-    ports=None,
-    volume_mount_path=None,
+    data_center_id: Optional[str] = None,
+    country_code: Optional[str] = None,
+    gpu_count: Optional[int] = None,
+    volume_in_gb: Optional[int] = None,
+    container_disk_in_gb: Optional[int] = None,
+    min_vcpu_count: Optional[int] = None,
+    min_memory_in_gb: Optional[int] = None,
+    docker_args: Optional[str] = None,
+    ports: Optional[str] = None,
+    volume_mount_path: Optional[str] = None,
     env: Optional[Dict[str, Any]] = None,
-    template_id=None,
-    network_volume_id=None,
+    template_id: Optional[str] = None,
+    network_volume_id: Optional[str] = None,
     allowed_cuda_versions: Optional[List[str]] = None,
     bid_per_gpu: Optional[float] = None,
 ) -> str:
@@ -425,7 +496,7 @@ def generate_pod_deployment_mutation(
         """
-def generate_pod_terminate_mutation(pod_id: str) -> str:
+def _generate_pod_terminate_mutation(pod_id: str) -> str:
     """
     Generates a mutation to terminate a pod.
     """
@@ -434,3 +505,118 @@ def generate_pod_terminate_mutation(pod_id: str) -> str:
         podTerminate(input: {{ podId: "{pod_id}" }})
     }}
     """
+def _generate_delete_cluster_mutation(cluster_id: str) -> str:
+    """
+    Generates a mutation to delete a cluster.
+    """
+    return f"""
+    mutation {{
+        deleteCluster(
+            input: {{
+                id: "{cluster_id}"
+            }}
+        )
+    }}
+    """
+def _generate_create_cluster_mutation(
+    cluster_name: str,
+    gpu_type_id: str,
+    pod_count: int,
+    gpu_count_per_pod: int,
+    image_name: str,
+    cluster_type: str,
+    deploy_cost: str,
+    template_id: Optional[str] = None,
+    network_volume_id: Optional[str] = None,
+    volume_in_gb: Optional[int] = None,
+    throughput: Optional[int] = None,
+    allowed_cuda_versions: Optional[List[str]] = None,
+    volume_key: Optional[str] = None,
+    data_center_id: Optional[str] = None,
+    start_jupyter: bool = False,
+    start_ssh: bool = False,
+    container_disk_in_gb: Optional[int] = None,
+    docker_args: Optional[str] = None,
+    env: Optional[Dict[str, Any]] = None,
+    volume_mount_path: Optional[str] = None,
+    ports: Optional[str] = None,
+) -> str:
+    """
+    Generates a mutation to create a cluster.
+    """
+    input_fields = []
+    # ------------------------------ Required Fields ----------------------------- #
+    input_fields.append(f'clusterName: "{cluster_name}"')
+    input_fields.append(f'gpuTypeId: "{gpu_type_id}"')
+    input_fields.append(f"podCount: {pod_count}")
+    input_fields.append(f'imageName: "{image_name}"')
+    input_fields.append(f"type: {cluster_type}")
+    input_fields.append(f"gpuCountPerPod: {gpu_count_per_pod}")
+    # If deploy_cost is not specified, Runpod returns Insufficient resources error.
+    input_fields.append(f"deployCost: {deploy_cost}")
+    # ------------------------------ Optional Fields ----------------------------- #
+    if template_id is not None:
+        input_fields.append(f'templateId: "{template_id}"')
+    if network_volume_id is not None:
+        input_fields.append(f'networkVolumeId: "{network_volume_id}"')
+    if volume_in_gb is not None:
+        input_fields.append(f"volumeInGb: {volume_in_gb}")
+    if throughput is not None:
+        input_fields.append(f"throughput: {throughput}")
+    if allowed_cuda_versions is not None:
+        allowed_cuda_versions_string = ", ".join(
+            [f'"{version}"' for version in allowed_cuda_versions]
+        )
+        input_fields.append(f"allowedCudaVersions: [{allowed_cuda_versions_string}]")
+    if volume_key is not None:
+        input_fields.append(f'volumeKey: "{volume_key}"')
+    if data_center_id is not None:
+        input_fields.append(f'dataCenterId: "{data_center_id}"')
+    if start_jupyter:
+        input_fields.append("startJupyter: true")
+    if start_ssh:
+        input_fields.append("startSsh: true")
+    if container_disk_in_gb is not None:
+        input_fields.append(f"containerDiskInGb: {container_disk_in_gb}")
+    if docker_args is not None:
+        input_fields.append(f'dockerArgs: "{docker_args}"')
+    if env is not None:
+        env_string = ", ".join(
+            [f'{{ key: "{key}", value: "{value}" }}' for key, value in env.items()]
+        )
+        input_fields.append(f"env: [{env_string}]")
+    if volume_mount_path is not None:
+        input_fields.append(f'volumeMountPath: "{volume_mount_path}"')
+    if ports is not None:
+        ports = ports.replace(" ", "")
+        input_fields.append(f'ports: "{ports}"')
+    # Format input fields
+    input_string = ", ".join(input_fields)
+    return f"""
+        mutation {{
+          createCluster(
+            input: {{
+              {input_string}
+            }}
+          ) {{
+            id
+            name
+            pods {{
+              id
+              clusterIp
+              lastStatusChange
+              imageName
+              machine {{
+                podHostId
+              }}
+            }}
+          }}
+        }}
+        """

dstack/_internal/core/backends/runpod/compute.py CHANGED Viewed

@@ -2,31 +2,34 @@ import json
 import uuid
 from collections.abc import Iterable
 from datetime import timedelta
-from typing import List, Optional
+from typing import Callable, List, Optional
 from dstack._internal.core.backends.base.backend import Compute
 from dstack._internal.core.backends.base.compute import (
     ComputeWithAllOffersCached,
+    ComputeWithGroupProvisioningSupport,
+    ComputeWithMultinodeSupport,
     ComputeWithVolumeSupport,
     generate_unique_instance_name,
     generate_unique_volume_name,
     get_docker_commands,
     get_job_instance_name,
 )
+from dstack._internal.core.backends.base.models import JobConfiguration
 from dstack._internal.core.backends.base.offers import (
     OfferModifier,
     get_catalog_offers,
     get_offers_disk_modifier,
 )
-from dstack._internal.core.backends.runpod.api_client import RunpodApiClient
+from dstack._internal.core.backends.runpod.api_client import RunpodApiClient, RunpodApiClientError
 from dstack._internal.core.backends.runpod.models import RunpodConfig
 from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
 from dstack._internal.core.errors import (
-    BackendError,
     ComputeError,
 )
 from dstack._internal.core.models.backends.base import BackendType
-from dstack._internal.core.models.common import RegistryAuth
+from dstack._internal.core.models.common import CoreModel, RegistryAuth
+from dstack._internal.core.models.compute_groups import ComputeGroup, ComputeGroupProvisioningData
 from dstack._internal.core.models.instances import (
     InstanceAvailability,
     InstanceConfiguration,
@@ -36,7 +39,7 @@ from dstack._internal.core.models.instances import (
 from dstack._internal.core.models.resources import Memory, Range
 from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
 from dstack._internal.core.models.volumes import Volume, VolumeProvisioningData
-from dstack._internal.utils.common import get_current_datetime
+from dstack._internal.utils.common import get_current_datetime, get_or_error
 from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
@@ -50,9 +53,15 @@ CONTAINER_REGISTRY_AUTH_CLEANUP_INTERVAL = 60 * 60 * 24  # 24 hour
 CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("1GB"), max=None)
+class RunpodOfferBackendData(CoreModel):
+    pod_counts: Optional[list[int]] = None
 class RunpodCompute(
     ComputeWithAllOffersCached,
     ComputeWithVolumeSupport,
+    ComputeWithMultinodeSupport,
+    ComputeWithGroupProvisioningSupport,
     Compute,
 ):
     _last_cleanup_time = None
@@ -80,6 +89,18 @@ class RunpodCompute(
     def get_offers_modifiers(self, requirements: Requirements) -> Iterable[OfferModifier]:
         return [get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)]
+    def get_offers_post_filter(
+        self, requirements: Requirements
+    ) -> Optional[Callable[[InstanceOfferWithAvailability], bool]]:
+        def offers_post_filter(offer: InstanceOfferWithAvailability) -> bool:
+            pod_counts = _get_offer_pod_counts(offer)
+            is_cluster_offer = len(pod_counts) > 0 and any(pc != 1 for pc in pod_counts)
+            if requirements.multinode:
+                return is_cluster_offer
+            return not is_cluster_offer
+        return offers_post_filter
     def run_job(
         self,
         run: Run,
@@ -151,6 +172,8 @@ class RunpodCompute(
         instance_id = resp["id"]
+        # Call edit_pod to pass container_registry_auth_id.
+        # Expect a long time (~5m) for the pod to pick up the creds.
         # TODO: remove editPod once createPod supports docker's username and password
         # editPod is temporary solution to set container_registry_auth_id because createPod does not
         # support it currently. This will be removed once createPod supports container_registry_auth_id
@@ -186,14 +209,127 @@ class RunpodCompute(
             backend_data=None,
         )
+    def run_jobs(
+        self,
+        run: Run,
+        job_configurations: List[JobConfiguration],
+        instance_offer: InstanceOfferWithAvailability,
+        project_ssh_public_key: str,
+        project_ssh_private_key: str,
+    ) -> ComputeGroupProvisioningData:
+        master_job_configuration = job_configurations[0]
+        master_job = master_job_configuration.job
+        master_job_volumes = master_job_configuration.volumes
+        all_volumes_names = set(v.name for jc in job_configurations for v in jc.volumes)
+        instance_config = InstanceConfiguration(
+            project_name=run.project_name,
+            instance_name=get_job_instance_name(run, master_job),
+            ssh_keys=[
+                SSHKey(public=get_or_error(run.run_spec.ssh_key_pub).strip()),
+                SSHKey(public=project_ssh_public_key.strip()),
+            ],
+            user=run.user,
+        )
+        pod_name = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
+        authorized_keys = instance_config.get_public_keys()
+        disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
+        network_volume_id = None
+        volume_mount_path = None
+        if len(master_job_volumes) > 1:
+            raise ComputeError("Mounting more than one network volume is not supported in runpod")
+        if len(all_volumes_names) > 1:
+            raise ComputeError(
+                "Mounting different volumes to different jobs is not supported in runpod"
+            )
+        if len(master_job_volumes) == 1:
+            network_volume_id = master_job_volumes[0].volume_id
+            volume_mount_path = run.run_spec.configuration.volumes[0].path
+        offer_pod_counts = _get_offer_pod_counts(instance_offer)
+        pod_count = len(job_configurations)
+        gpu_count = len(instance_offer.instance.resources.gpus)
+        data_center_id = instance_offer.region
+        if pod_count not in offer_pod_counts:
+            raise ComputeError(
+                f"Failed to provision {pod_count} pods. Available pod counts: {offer_pod_counts}"
+            )
+        container_registry_auth_id = self._generate_container_registry_auth_id(
+            master_job.job_spec.registry_auth
+        )
+        resp = self.api_client.create_cluster(
+            cluster_name=pod_name,
+            gpu_type_id=instance_offer.instance.name,
+            pod_count=pod_count,
+            gpu_count_per_pod=gpu_count,
+            deploy_cost=f"{instance_offer.price * pod_count:.2f}",
+            image_name=master_job.job_spec.image_name,
+            cluster_type="TRAINING",
+            data_center_id=data_center_id,
+            container_disk_in_gb=disk_size,
+            docker_args=_get_docker_args(authorized_keys),
+            ports=f"{DSTACK_RUNNER_SSH_PORT}/tcp",
+            network_volume_id=network_volume_id,
+            volume_mount_path=volume_mount_path,
+            env={"RUNPOD_POD_USER": "0"},
+        )
+        # An "edit pod" trick to pass container registry creds.
+        if container_registry_auth_id is not None:
+            for pod in resp["pods"]:
+                self.api_client.edit_pod(
+                    pod_id=pod["id"],
+                    image_name=master_job.job_spec.image_name,
+                    container_disk_in_gb=disk_size,
+                    container_registry_auth_id=container_registry_auth_id,
+                )
+        jpds = [
+            JobProvisioningData(
+                backend=instance_offer.backend,
+                instance_type=instance_offer.instance,
+                instance_id=pod["id"],
+                hostname=None,
+                internal_ip=pod["clusterIp"],
+                region=instance_offer.region,
+                price=instance_offer.price,
+                username="root",
+                dockerized=False,
+            )
+            for pod in resp["pods"]
+        ]
+        return ComputeGroupProvisioningData(
+            compute_group_id=resp["id"],
+            compute_group_name=resp["name"],
+            backend=BackendType.RUNPOD,
+            region=instance_offer.region,
+            job_provisioning_datas=jpds,
+        )
     def terminate_instance(
         self, instance_id: str, region: str, backend_data: Optional[str] = None
-    ) -> None:
+    ):
         try:
             self.api_client.terminate_pod(instance_id)
-        except BackendError as e:
-            if e.args[0] == "Instance Not Found":
-                logger.debug("The instance with name %s not found", instance_id)
+        except RunpodApiClientError as e:
+            if len(e.errors) > 0 and e.errors[0]["message"] == "pod not found to terminate":
+                logger.debug("The instance %s not found. Skipping deletion.", instance_id)
+                return
+            raise
+    def terminate_compute_group(self, compute_group: ComputeGroup):
+        provisioning_data = compute_group.provisioning_data
+        try:
+            self.api_client.delete_cluster(provisioning_data.compute_group_id)
+        except RunpodApiClientError as e:
+            if len(e.errors) > 0 and e.errors[0]["extensions"]["code"] == "Cluster not found":
+                logger.debug(
+                    "The cluster %s not found. Skipping deletion.",
+                    provisioning_data.compute_group_id,
+                )
                 return
             raise
@@ -216,7 +352,9 @@ class RunpodCompute(
                 provisioning_data.ssh_port = port["publicPort"]
     def register_volume(self, volume: Volume) -> VolumeProvisioningData:
-        volume_data = self.api_client.get_network_volume(volume_id=volume.configuration.volume_id)
+        volume_data = self.api_client.get_network_volume(
+            volume_id=get_or_error(volume.configuration.volume_id)
+        )
         if volume_data is None:
             raise ComputeError(f"Volume {volume.configuration.volume_id} not found")
         size_gb = volume_data["size"]
@@ -258,14 +396,12 @@ class RunpodCompute(
     ) -> Optional[str]:
         if registry_auth is None:
             return None
         return self.api_client.add_container_registry_auth(
             uuid.uuid4().hex, registry_auth.username, registry_auth.password
         )
     def _clean_stale_container_registry_auths(self) -> None:
         container_registry_auths = self.api_client.get_container_registry_auths()
         # Container_registry_auths sorted by creation time so try to delete the oldest first
         # when we reach container_registry_auths that is still in use, we stop
         for container_registry_auth in container_registry_auths:
@@ -289,9 +425,17 @@ def _get_volume_price(size: int) -> float:
     return 0.05 * size
-def _is_secure_cloud(region: str) -> str:
+def _is_secure_cloud(region: str) -> bool:
     """
     Secure cloud regions are datacenter IDs: CA-MTL-1, EU-NL-1, etc.
     Community cloud regions are country codes: CA, NL, etc.
     """
     return "-" in region
+def _get_offer_pod_counts(offer: InstanceOfferWithAvailability) -> list[int]:
+    backend_data: RunpodOfferBackendData = RunpodOfferBackendData.__response__.parse_obj(
+        offer.backend_data
+    )
+    pod_counts = backend_data.pod_counts or []
+    return pod_counts

dstack 0.19.34__py3-none-any.whl → 0.19.35__py3-none-any.whl

Potentially problematic release.

dstack 0.19.34__py3-none-any.whl → 0.19.35__py3-none-any.whl