dstack 0.19.28__py3-none-any.whl → 0.19.30__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Files changed (51)
  1. dstack/_internal/cli/main.py +3 -1
  2. dstack/_internal/cli/services/configurators/fleet.py +20 -6
  3. dstack/_internal/cli/utils/gpu.py +2 -2
  4. dstack/_internal/core/backends/aws/compute.py +62 -41
  5. dstack/_internal/core/backends/aws/resources.py +11 -6
  6. dstack/_internal/core/backends/azure/compute.py +25 -13
  7. dstack/_internal/core/backends/base/compute.py +121 -14
  8. dstack/_internal/core/backends/base/offers.py +34 -4
  9. dstack/_internal/core/backends/cloudrift/compute.py +5 -7
  10. dstack/_internal/core/backends/cudo/compute.py +4 -2
  11. dstack/_internal/core/backends/datacrunch/compute.py +13 -11
  12. dstack/_internal/core/backends/digitalocean_base/compute.py +4 -5
  13. dstack/_internal/core/backends/gcp/compute.py +25 -11
  14. dstack/_internal/core/backends/hotaisle/compute.py +4 -7
  15. dstack/_internal/core/backends/kubernetes/compute.py +6 -4
  16. dstack/_internal/core/backends/lambdalabs/compute.py +4 -5
  17. dstack/_internal/core/backends/local/compute.py +1 -3
  18. dstack/_internal/core/backends/nebius/compute.py +10 -7
  19. dstack/_internal/core/backends/oci/compute.py +15 -8
  20. dstack/_internal/core/backends/oci/resources.py +8 -3
  21. dstack/_internal/core/backends/runpod/compute.py +15 -6
  22. dstack/_internal/core/backends/template/compute.py.jinja +3 -1
  23. dstack/_internal/core/backends/tensordock/compute.py +1 -3
  24. dstack/_internal/core/backends/tensordock/models.py +2 -0
  25. dstack/_internal/core/backends/vastai/compute.py +7 -3
  26. dstack/_internal/core/backends/vultr/compute.py +5 -5
  27. dstack/_internal/core/consts.py +2 -0
  28. dstack/_internal/core/models/projects.py +8 -0
  29. dstack/_internal/core/services/repos.py +101 -10
  30. dstack/_internal/server/background/tasks/process_instances.py +3 -2
  31. dstack/_internal/server/background/tasks/process_running_jobs.py +1 -1
  32. dstack/_internal/server/background/tasks/process_submitted_jobs.py +100 -47
  33. dstack/_internal/server/services/backends/__init__.py +1 -1
  34. dstack/_internal/server/services/projects.py +11 -3
  35. dstack/_internal/server/services/runs.py +2 -0
  36. dstack/_internal/server/statics/index.html +1 -1
  37. dstack/_internal/server/statics/main-56191fbfe77f49b251de.css +3 -0
  38. dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js → main-c51afa7f243e24d3e446.js} +61081 -49037
  39. dstack/_internal/server/statics/{main-a2a16772fbf11a14d191.js.map → main-c51afa7f243e24d3e446.js.map} +1 -1
  40. dstack/_internal/utils/ssh.py +22 -2
  41. dstack/version.py +2 -2
  42. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/METADATA +8 -6
  43. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/RECORD +46 -50
  44. dstack/_internal/core/backends/tensordock/__init__.py +0 -0
  45. dstack/_internal/core/backends/tensordock/api_client.py +0 -104
  46. dstack/_internal/core/backends/tensordock/backend.py +0 -16
  47. dstack/_internal/core/backends/tensordock/configurator.py +0 -74
  48. dstack/_internal/server/statics/main-5e0d56245c4bd241ec27.css +0 -3
  49. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/WHEEL +0 -0
  50. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/entry_points.txt +0 -0
  51. {dstack-0.19.28.dist-info → dstack-0.19.30.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/cli/main.py
@@ -22,7 +22,7 @@ from dstack._internal.cli.commands.server import ServerCommand
 from dstack._internal.cli.commands.stats import StatsCommand
 from dstack._internal.cli.commands.stop import StopCommand
 from dstack._internal.cli.commands.volume import VolumeCommand
-from dstack._internal.cli.utils.common import _colors, console
+from dstack._internal.cli.utils.common import _colors, configure_logging, console
 from dstack._internal.cli.utils.updates import check_for_updates
 from dstack._internal.core.errors import ClientError, CLIError, ConfigurationError, SSHError
 from dstack._internal.core.services.ssh.client import get_ssh_client_info
@@ -39,6 +39,8 @@ def main():
     RichHelpFormatter.styles["argparse.groups"] = "bold grey74"
     RichHelpFormatter.styles["argparse.text"] = "grey74"

+    configure_logging()
+
     parser = argparse.ArgumentParser(
         description=(
             "Not sure where to start?"
dstack/_internal/cli/services/configurators/fleet.py
@@ -159,12 +159,19 @@ class FleetConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator[Fle
         console.print(
             get_fleets_table(
                 [fleet],
-                verbose=_failed_provisioning(fleet),
+                verbose=_fleet_has_failed_instances(fleet),
                 format_date=local_time,
             )
         )
-        if _failed_provisioning(fleet):
-            console.print("\n[error]Some instances failed. Check the table above for errors.[/]")
+        if _fleet_has_failed_instances(fleet):
+            if _fleet_retrying(fleet):
+                console.print(
+                    "\n[error]Some instances failed. Provisioning will be retried in the background.[/]"
+                )
+            else:
+                console.print(
+                    "\n[error]Some instances failed. Check the table above for errors.[/]"
+                )
            exit(1)

    def _apply_plan_on_old_server(self, plan: FleetPlan, command_args: argparse.Namespace):
@@ -253,11 +260,11 @@ class FleetConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator[Fle
         console.print(
             get_fleets_table(
                 [fleet],
-                verbose=_failed_provisioning(fleet),
+                verbose=_fleet_has_failed_instances(fleet),
                 format_date=local_time,
             )
         )
-        if _failed_provisioning(fleet):
+        if _fleet_has_failed_instances(fleet):
            console.print("\n[error]Some instances failed. Check the table above for errors.[/]")
            exit(1)

@@ -462,13 +469,20 @@ def _finished_provisioning(fleet: Fleet) -> bool:
     return True


-def _failed_provisioning(fleet: Fleet) -> bool:
+def _fleet_has_failed_instances(fleet: Fleet) -> bool:
     for instance in fleet.instances:
         if instance.status == InstanceStatus.TERMINATED:
             return True
     return False


+def _fleet_retrying(fleet: Fleet) -> bool:
+    if fleet.spec.configuration.nodes is None:
+        return False
+    active_instances = [i for i in fleet.instances if i.status.is_active()]
+    return len(active_instances) < fleet.spec.configuration.nodes.min
+
+
 def _apply_plan(api: Client, plan: FleetPlan) -> Fleet:
     try:
         return api.client.fleets.apply_plan(
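
The new retry condition above compares the number of active instances against the fleet's configured minimum node count. A minimal self-contained sketch of that condition, using hypothetical stand-in classes rather than dstack's Fleet/Instance models:

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class _Nodes:  # stand-in for fleet.spec.configuration.nodes
    min: int


@dataclass
class _Instance:  # stand-in for an instance; the real code checks status.is_active()
    active: bool


def fleet_retrying(nodes: Optional[_Nodes], instances: List[_Instance]) -> bool:
    # Mirrors _fleet_retrying(): no nodes config means no background retries;
    # otherwise the fleet keeps retrying while active instances < nodes.min.
    if nodes is None:
        return False
    return sum(1 for i in instances if i.active) < nodes.min


assert fleet_retrying(_Nodes(min=2), [_Instance(active=True)]) is True
assert fleet_retrying(_Nodes(min=1), [_Instance(active=True)]) is False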
dstack/_internal/cli/utils/gpu.py
@@ -9,7 +9,7 @@ from dstack._internal.core.models.runs import Requirements, RunSpec, get_policy_
 from dstack._internal.server.schemas.gpus import GpuGroup


-def print_gpu_json(gpu_response, run_spec, group_by_cli, api_project):
+def print_gpu_json(gpus, run_spec, group_by_cli, api_project):
     """Print GPU information in JSON format."""
     req = Requirements(
         resources=run_spec.configuration.resources,
@@ -36,7 +36,7 @@ def print_gpu_json(gpu_response, run_spec, group_by_cli, api_project):
         "gpus": [],
     }

-    for gpu_group in gpu_response.gpus:
+    for gpu_group in gpus:
         gpu_data = {
             "name": gpu_group.name,
             "memory_mib": gpu_group.memory_mib,
dstack/_internal/core/backends/aws/compute.py
@@ -1,6 +1,6 @@
 import threading
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple

 import boto3
 import botocore.client
@@ -18,6 +18,7 @@ from dstack._internal.core.backends.aws.models import (
 )
 from dstack._internal.core.backends.base.compute import (
     Compute,
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithGatewaySupport,
     ComputeWithMultinodeSupport,
@@ -32,7 +33,7 @@ from dstack._internal.core.backends.base.compute import (
     get_user_data,
     merge_tags,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers
+from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
 from dstack._internal.core.errors import (
     ComputeError,
     NoCapacityError,
@@ -87,6 +88,7 @@ def _ec2client_cache_methodkey(self, ec2_client, *args, **kwargs):


 class AWSCompute(
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     ComputeWithReservationSupport,
@@ -109,6 +111,8 @@ class AWSCompute(
         # Caches to avoid redundant API calls when provisioning many instances
         # get_offers is already cached but we still cache its sub-functions
         # with more aggressive/longer caches.
+        self._offers_post_filter_cache_lock = threading.Lock()
+        self._offers_post_filter_cache = TTLCache(maxsize=10, ttl=180)
         self._get_regions_to_quotas_cache_lock = threading.Lock()
         self._get_regions_to_quotas_execution_lock = threading.Lock()
         self._get_regions_to_quotas_cache = TTLCache(maxsize=10, ttl=300)
@@ -125,43 +129,11 @@ class AWSCompute(
         self._get_image_id_and_username_cache_lock = threading.Lock()
         self._get_image_id_and_username_cache = TTLCache(maxsize=100, ttl=600)

-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
-        filter = _supported_instances
-        if requirements and requirements.reservation:
-            region_to_reservation = {}
-            for region in self.config.regions:
-                reservation = aws_resources.get_reservation(
-                    ec2_client=self.session.client("ec2", region_name=region),
-                    reservation_id=requirements.reservation,
-                    instance_count=1,
-                )
-                if reservation is not None:
-                    region_to_reservation[region] = reservation
-
-            def _supported_instances_with_reservation(offer: InstanceOffer) -> bool:
-                # Filter: only instance types supported by dstack
-                if not _supported_instances(offer):
-                    return False
-                # Filter: Spot instances can't be used with reservations
-                if offer.instance.resources.spot:
-                    return False
-                region = offer.region
-                reservation = region_to_reservation.get(region)
-                # Filter: only instance types matching the capacity reservation
-                if not bool(reservation and offer.instance.name == reservation["InstanceType"]):
-                    return False
-                return True
-
-            filter = _supported_instances_with_reservation
-
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.AWS,
             locations=self.config.regions,
-            requirements=requirements,
-            configurable_disk_size=CONFIGURABLE_DISK_SIZE,
-            extra_filter=filter,
+            extra_filter=_supported_instances,
         )
         regions = list(set(i.region for i in offers))
         with self._get_regions_to_quotas_execution_lock:
@@ -185,6 +157,49 @@ class AWSCompute(
         )
         return availability_offers

+    def get_offers_modifier(
+        self, requirements: Requirements
+    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
+        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+
+    def _get_offers_cached_key(self, requirements: Requirements) -> int:
+        # Requirements is not hashable, so we use a hack to get arguments hash
+        return hash(requirements.json())
+
+    @cachedmethod(
+        cache=lambda self: self._offers_post_filter_cache,
+        key=_get_offers_cached_key,
+        lock=lambda self: self._offers_post_filter_cache_lock,
+    )
+    def get_offers_post_filter(
+        self, requirements: Requirements
+    ) -> Optional[Callable[[InstanceOfferWithAvailability], bool]]:
+        if requirements.reservation:
+            region_to_reservation = {}
+            for region in get_or_error(self.config.regions):
+                reservation = aws_resources.get_reservation(
+                    ec2_client=self.session.client("ec2", region_name=region),
+                    reservation_id=requirements.reservation,
+                    instance_count=1,
+                )
+                if reservation is not None:
+                    region_to_reservation[region] = reservation
+
+            def reservation_filter(offer: InstanceOfferWithAvailability) -> bool:
+                # Filter: Spot instances can't be used with reservations
+                if offer.instance.resources.spot:
+                    return False
+                region = offer.region
+                reservation = region_to_reservation.get(region)
+                # Filter: only instance types matching the capacity reservation
+                if not bool(reservation and offer.instance.name == reservation["InstanceType"]):
+                    return False
+                return True
+
+            return reservation_filter
+
+        return None
+
     def terminate_instance(
         self, instance_id: str, region: str, backend_data: Optional[str] = None
     ) -> None:
@@ -276,7 +291,11 @@ class AWSCompute(
         image_id, username = self._get_image_id_and_username(
             ec2_client=ec2_client,
             region=instance_offer.region,
-            cuda=len(instance_offer.instance.resources.gpus) > 0,
+            gpu_name=(
+                instance_offer.instance.resources.gpus[0].name
+                if len(instance_offer.instance.resources.gpus) > 0
+                else None
+            ),
             instance_type=instance_offer.instance.name,
             image_config=self.config.os_images,
         )
@@ -882,11 +901,13 @@ class AWSCompute(
         self,
         ec2_client: botocore.client.BaseClient,
         region: str,
-        cuda: bool,
+        gpu_name: Optional[str],
         instance_type: str,
         image_config: Optional[AWSOSImageConfig] = None,
     ) -> tuple:
-        return hashkey(region, cuda, instance_type, image_config.json() if image_config else None)
+        return hashkey(
+            region, gpu_name, instance_type, image_config.json() if image_config else None
+        )

     @cachedmethod(
         cache=lambda self: self._get_image_id_and_username_cache,
@@ -897,13 +918,13 @@ class AWSCompute(
         self,
         ec2_client: botocore.client.BaseClient,
         region: str,
-        cuda: bool,
+        gpu_name: Optional[str],
         instance_type: str,
         image_config: Optional[AWSOSImageConfig] = None,
     ) -> tuple[str, str]:
         return aws_resources.get_image_id_and_username(
             ec2_client=ec2_client,
-            cuda=cuda,
+            gpu_name=gpu_name,
             instance_type=instance_type,
             image_config=image_config,
         )
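
The `_offers_post_filter_cache` above (like the offers caches in base/compute.py further down) relies on one trick: cachetools' @cachedmethod needs a hashable key, pydantic models such as Requirements are not hashable, so the key function hashes the model's JSON serialization instead. A minimal sketch of the pattern; `Req`, `Backend`, and `expensive_lookup` are hypothetical stand-ins:

import threading
from typing import Optional

from cachetools import TTLCache, cachedmethod
from pydantic import BaseModel


class Req(BaseModel):  # stand-in for dstack's Requirements model
    reservation: Optional[str] = None


class Backend:
    def __init__(self) -> None:
        self._cache = TTLCache(maxsize=10, ttl=180)
        self._lock = threading.Lock()

    def _key(self, req: Req) -> int:
        # Models aren't hashable, so hash their JSON serialization instead.
        return hash(req.json())

    @cachedmethod(cache=lambda self: self._cache, key=_key, lock=lambda self: self._lock)
    def expensive_lookup(self, req: Req) -> str:
        # Runs once per distinct Req within the TTL; cached thereafter.
        return f"computed for {req.reservation}"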
dstack/_internal/core/backends/aws/resources.py
@@ -6,6 +6,8 @@ import botocore.exceptions

 import dstack.version as version
 from dstack._internal.core.backends.aws.models import AWSOSImageConfig
+from dstack._internal.core.backends.base.compute import requires_nvidia_proprietary_kernel_modules
+from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
 from dstack._internal.core.errors import BackendError, ComputeError, ComputeResourceNotFoundError
 from dstack._internal.utils.logging import get_logger

@@ -17,14 +19,14 @@ DLAMI_OWNER_ACCOUNT_ID = "898082745236"

 def get_image_id_and_username(
     ec2_client: botocore.client.BaseClient,
-    cuda: bool,
+    gpu_name: Optional[str],
     instance_type: str,
     image_config: Optional[AWSOSImageConfig] = None,
 ) -> tuple[str, str]:
     if image_config is not None:
-        image = image_config.nvidia if cuda else image_config.cpu
+        image = image_config.nvidia if gpu_name else image_config.cpu
         if image is None:
-            logger.warning("%s image not configured", "nvidia" if cuda else "cpu")
+            logger.warning("%s image not configured", "nvidia" if gpu_name else "cpu")
             raise ComputeResourceNotFoundError()
         image_name = image.name
         image_owner = image.owner
@@ -35,9 +37,12 @@ def get_image_id_and_username(
         image_owner = DLAMI_OWNER_ACCOUNT_ID
         username = "ubuntu"
     else:
-        image_name = (
-            f"dstack-{version.base_image}" if not cuda else f"dstack-cuda-{version.base_image}"
-        )
+        if gpu_name is None:
+            image_name = f"dstack-{version.base_image}"
+        elif not requires_nvidia_proprietary_kernel_modules(gpu_name):
+            image_name = f"dstack-cuda-{version.base_image}"
+        else:
+            image_name = f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
         image_owner = DSTACK_ACCOUNT_ID
         username = "ubuntu"
     response = ec2_client.describe_images(
dstack/_internal/core/backends/azure/compute.py
@@ -2,7 +2,7 @@ import base64
 import enum
 import re
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, List, Optional, Tuple
+from typing import Callable, Dict, List, Optional, Tuple

 from azure.core.credentials import TokenCredential
 from azure.core.exceptions import ResourceExistsError, ResourceNotFoundError
@@ -39,6 +39,7 @@ from dstack._internal.core.backends.azure import utils as azure_utils
 from dstack._internal.core.backends.azure.models import AzureConfig
 from dstack._internal.core.backends.base.compute import (
     Compute,
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithGatewaySupport,
     ComputeWithMultinodeSupport,
@@ -47,8 +48,10 @@ from dstack._internal.core.backends.base.compute import (
     get_gateway_user_data,
     get_user_data,
     merge_tags,
+    requires_nvidia_proprietary_kernel_modules,
 )
-from dstack._internal.core.backends.base.offers import get_catalog_offers
+from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
+from dstack._internal.core.consts import DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES
 from dstack._internal.core.errors import ComputeError, NoCapacityError
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.gateways import (
@@ -73,6 +76,7 @@ CONFIGURABLE_DISK_SIZE = Range[Memory](min=Memory.parse("30GB"), max=Memory.pars


 class AzureCompute(
+    ComputeWithAllOffersCached,
     ComputeWithCreateInstanceSupport,
     ComputeWithMultinodeSupport,
     ComputeWithGatewaySupport,
@@ -89,14 +93,10 @@ class AzureCompute(
             credential=credential, subscription_id=config.subscription_id
         )

-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.AZURE,
             locations=self.config.regions,
-            requirements=requirements,
-            configurable_disk_size=CONFIGURABLE_DISK_SIZE,
             extra_filter=_supported_instances,
         )
         offers_with_availability = _get_offers_with_availability(
@@ -106,6 +106,11 @@ class AzureCompute(
         )
         return offers_with_availability

+    def get_offers_modifier(
+        self, requirements: Requirements
+    ) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
+        return get_offers_disk_modifier(CONFIGURABLE_DISK_SIZE, requirements)
+
     def create_instance(
         self,
         instance_offer: InstanceOfferWithAvailability,
@@ -369,6 +374,7 @@ def _parse_config_vpc_id(vpc_id: str) -> Tuple[str, str]:
 class VMImageVariant(enum.Enum):
     GRID = enum.auto()
     CUDA = enum.auto()
+    CUDA_WITH_PROPRIETARY_KERNEL_MODULES = enum.auto()
     STANDARD = enum.auto()

     @classmethod
@@ -376,18 +382,24 @@ class VMImageVariant(enum.Enum):
         if "_A10_v5" in instance.name:
             return cls.GRID
         elif len(instance.resources.gpus) > 0:
-            return cls.CUDA
+            if not requires_nvidia_proprietary_kernel_modules(instance.resources.gpus[0].name):
+                return cls.CUDA
+            else:
+                return cls.CUDA_WITH_PROPRIETARY_KERNEL_MODULES
         else:
             return cls.STANDARD

     def get_image_name(self) -> str:
-        name = "dstack-"
         if self is self.GRID:
-            name += "grid-"
+            return f"dstack-grid-{version.base_image}"
         elif self is self.CUDA:
-            name += "cuda-"
-        name += version.base_image
-        return name
+            return f"dstack-cuda-{version.base_image}"
+        elif self is self.CUDA_WITH_PROPRIETARY_KERNEL_MODULES:
+            return f"dstack-cuda-{DSTACK_OS_IMAGE_WITH_PROPRIETARY_NVIDIA_KERNEL_MODULES}"
+        elif self is self.STANDARD:
+            return f"dstack-{version.base_image}"
+        else:
+            raise ValueError(f"Unexpected image variant {self!r}")


 _SUPPORTED_VM_SERIES_PATTERNS = [
dstack/_internal/core/backends/base/compute.py
@@ -7,7 +7,7 @@ from abc import ABC, abstractmethod
 from collections.abc import Iterable
 from functools import lru_cache
 from pathlib import Path
-from typing import Dict, List, Literal, Optional
+from typing import Callable, Dict, List, Literal, Optional

 import git
 import requests
@@ -15,6 +15,7 @@ import yaml
 from cachetools import TTLCache, cachedmethod

 from dstack._internal import settings
+from dstack._internal.core.backends.base.offers import filter_offers_by_requirements
 from dstack._internal.core.consts import (
     DSTACK_RUNNER_HTTP_PORT,
     DSTACK_RUNNER_SSH_PORT,
@@ -47,6 +48,22 @@ logger = get_logger(__name__)
 DSTACK_SHIM_BINARY_NAME = "dstack-shim"
 DSTACK_RUNNER_BINARY_NAME = "dstack-runner"
 DEFAULT_PRIVATE_SUBNETS = ("10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16")
+NVIDIA_GPUS_REQUIRING_PROPRIETARY_KERNEL_MODULES = frozenset(
+    # All NVIDIA architectures prior to Turing do not support Open Kernel Modules and require
+    # proprietary modules. This list is incomplete, update when necessary.
+    [
+        "v100",
+        "p100",
+        "p40",
+        "p4",
+        "m60",
+        "m40",
+        "m4",
+        "k80",
+        "k40",
+        "k20",
+    ]
+)

 GoArchType = Literal["amd64", "arm64"]

@@ -57,14 +74,8 @@ class Compute(ABC):
     If a compute supports additional features, it must also subclass `ComputeWith*` classes.
     """

-    def __init__(self):
-        self._offers_cache_lock = threading.Lock()
-        self._offers_cache = TTLCache(maxsize=10, ttl=180)
-
     @abstractmethod
-    def get_offers(
-        self, requirements: Optional[Requirements] = None
-    ) -> List[InstanceOfferWithAvailability]:
+    def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
         """
         Returns offers with availability matching `requirements`.
         If the provider is added to gpuhunt, typically gets offers using `base.offers.get_catalog_offers()`
@@ -121,10 +132,97 @@ class Compute(ABC):
         """
         pass

-    def _get_offers_cached_key(self, requirements: Optional[Requirements] = None) -> int:
+
+class ComputeWithAllOffersCached(ABC):
+    """
+    Provides common `get_offers()` implementation for backends
+    whose offers do not depend on requirements.
+    It caches all offers with availability and post-filters by requirements.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self._offers_cache_lock = threading.Lock()
+        self._offers_cache = TTLCache(maxsize=1, ttl=180)
+
+    @abstractmethod
+    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
+        """
+        Returns all backend offers with availability.
+        """
+        pass
+
+    def get_offers_modifier(
+        self, requirements: Requirements
+    ) -> Optional[
+        Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]
+    ]:
+        """
+        Returns a modifier function that modifies offers before they are filtered by requirements.
+        Can return `None` to exclude the offer.
+        E.g. can be used to set appropriate disk size based on requirements.
+        """
+        return None
+
+    def get_offers_post_filter(
+        self, requirements: Requirements
+    ) -> Optional[Callable[[InstanceOfferWithAvailability], bool]]:
+        """
+        Returns a filter function to apply to offers based on requirements.
+        This allows backends to implement custom post-filtering logic for specific requirements.
+        """
+        return None
+
+    def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
+        offers = self._get_all_offers_with_availability_cached()
+        modifier = self.get_offers_modifier(requirements)
+        if modifier is not None:
+            modified_offers = []
+            for o in offers:
+                modified_offer = modifier(o)
+                if modified_offer is not None:
+                    modified_offers.append(modified_offer)
+            offers = modified_offers
+        offers = filter_offers_by_requirements(offers, requirements)
+        post_filter = self.get_offers_post_filter(requirements)
+        if post_filter is not None:
+            offers = [o for o in offers if post_filter(o)]
+        return offers
+
+    @cachedmethod(
+        cache=lambda self: self._offers_cache,
+        lock=lambda self: self._offers_cache_lock,
+    )
+    def _get_all_offers_with_availability_cached(self) -> List[InstanceOfferWithAvailability]:
+        return self.get_all_offers_with_availability()
+
+
+class ComputeWithFilteredOffersCached(ABC):
+    """
+    Provides common `get_offers()` implementation for backends
+    whose offers depend on requirements.
+    It caches offers using requirements as key.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self._offers_cache_lock = threading.Lock()
+        self._offers_cache = TTLCache(maxsize=10, ttl=180)
+
+    @abstractmethod
+    def get_offers_by_requirements(
+        self, requirements: Requirements
+    ) -> List[InstanceOfferWithAvailability]:
+        """
+        Returns backend offers with availability matching requirements.
+        """
+        pass
+
+    def get_offers(self, requirements: Requirements) -> List[InstanceOfferWithAvailability]:
+        return self._get_offers_cached(requirements)
+
+    def _get_offers_cached_key(self, requirements: Requirements) -> int:
         # Requirements is not hashable, so we use a hack to get arguments hash
-        if requirements is None:
-            return hash(None)
         return hash(requirements.json())

     @cachedmethod(
@@ -132,10 +230,10 @@
         key=_get_offers_cached_key,
         lock=lambda self: self._offers_cache_lock,
     )
-    def get_offers_cached(
-        self, requirements: Optional[Requirements] = None
+    def _get_offers_cached(
+        self, requirements: Requirements
     ) -> List[InstanceOfferWithAvailability]:
-        return self.get_offers(requirements)
+        return self.get_offers_by_requirements(requirements)


 class ComputeWithCreateInstanceSupport(ABC):
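
Taken together, the two new mixins split backends into those that can list all offers cheaply and post-filter per request (ComputeWithAllOffersCached) and those that must query per requirements (ComputeWithFilteredOffersCached). A hedged sketch of how a backend adopts the first mixin; `ExampleCompute` and its empty offer list are hypothetical, while the hook names are the ones defined above:

from typing import List

from dstack._internal.core.backends.base.compute import (
    Compute,
    ComputeWithAllOffersCached,
)
from dstack._internal.core.models.instances import InstanceOfferWithAvailability


class ExampleCompute(ComputeWithAllOffersCached, Compute):
    """Hypothetical backend sketch; only the offer-listing hook is shown."""

    def get_all_offers_with_availability(self) -> List[InstanceOfferWithAvailability]:
        # Called at most once per cache TTL (180s). The mixin's get_offers()
        # then applies get_offers_modifier(), filter_offers_by_requirements(),
        # and get_offers_post_filter() to this cached list on every request.
        return []  # a real backend would query its provider API here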
@@ -887,3 +985,12 @@ def merge_tags(
     for k, v in resource_tags.items():
         res.setdefault(k, v)
     return res
+
+
+def requires_nvidia_proprietary_kernel_modules(gpu_name: str) -> bool:
+    """
+    Returns:
+        Whether this NVIDIA GPU requires NVIDIA proprietary kernel modules
+        instead of open kernel modules.
+    """
+    return gpu_name.lower() in NVIDIA_GPUS_REQUIRING_PROPRIETARY_KERNEL_MODULES
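
A quick illustration of the helper's contract, using GPU names from the frozenset added above (the check lowercases its input, so it is case-insensitive):

from dstack._internal.core.backends.base.compute import (
    requires_nvidia_proprietary_kernel_modules,
)

# Pre-Turing GPUs are in the frozenset; newer architectures fall through to False.
assert requires_nvidia_proprietary_kernel_modules("V100") is True
assert requires_nvidia_proprietary_kernel_modules("v100") is True
assert requires_nvidia_proprietary_kernel_modules("A100") is False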
dstack/_internal/core/backends/base/offers.py
@@ -1,5 +1,5 @@
 from dataclasses import asdict
-from typing import Callable, List, Optional
+from typing import Callable, List, Optional, TypeVar

 import gpuhunt
 from pydantic import parse_obj_as
@@ -9,11 +9,13 @@ from dstack._internal.core.models.instances import (
     Disk,
     Gpu,
     InstanceOffer,
+    InstanceOfferWithAvailability,
     InstanceType,
     Resources,
 )
 from dstack._internal.core.models.resources import DEFAULT_DISK, CPUSpec, Memory, Range
 from dstack._internal.core.models.runs import Requirements
+from dstack._internal.utils.common import get_or_error

 # Offers not supported by all dstack versions are hidden behind one or more flags.
 # This list enables the flags that are currently supported.
@@ -163,9 +165,13 @@ def requirements_to_query_filter(req: Optional[Requirements]) -> gpuhunt.QueryFi
     return q


-def match_requirements(
-    offers: List[InstanceOffer], requirements: Optional[Requirements]
-) -> List[InstanceOffer]:
+InstanceOfferT = TypeVar("InstanceOfferT", InstanceOffer, InstanceOfferWithAvailability)
+
+
+def filter_offers_by_requirements(
+    offers: List[InstanceOfferT],
+    requirements: Optional[Requirements],
+) -> List[InstanceOfferT]:
     query_filter = requirements_to_query_filter(requirements)
     filtered_offers = []
     for offer in offers:
@@ -190,3 +196,27 @@ def choose_disk_size_mib(
         disk_size_gib = disk_size_range.min

     return round(disk_size_gib * 1024)
+
+
+def get_offers_disk_modifier(
+    configurable_disk_size: Range[Memory], requirements: Requirements
+) -> Callable[[InstanceOfferWithAvailability], Optional[InstanceOfferWithAvailability]]:
+    """
+    Returns a func that modifies offers disk by setting min value that satisfies both
+    `configurable_disk_size` and `requirements`.
+    """
+
+    def modifier(offer: InstanceOfferWithAvailability) -> Optional[InstanceOfferWithAvailability]:
+        requirements_disk_range = DEFAULT_DISK.size
+        if requirements.resources.disk is not None:
+            requirements_disk_range = requirements.resources.disk.size
+        disk_size_range = requirements_disk_range.intersect(configurable_disk_size)
+        if disk_size_range is None:
+            return None
+        offer_copy = offer.copy(deep=True)
+        offer_copy.instance.resources.disk = Disk(
+            size_mib=get_or_error(disk_size_range.min) * 1024
+        )
+        return offer_copy
+
+    return modifier
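
To make the modifier's arithmetic concrete: it intersects the run's requested disk range with the backend's configurable range, then pins the offer's disk to the intersection's minimum, converted to MiB. An illustrative sketch under assumed values (a hypothetical 30GB-2000GB backend range and a run requesting at least 100GB):

from dstack._internal.core.models.resources import Memory, Range

configurable = Range[Memory](min=Memory.parse("30GB"), max=Memory.parse("2000GB"))
requested = Range[Memory](min=Memory.parse("100GB"), max=None)

# Same logic as get_offers_disk_modifier(): take the intersection, keep its min.
chosen = requested.intersect(configurable)
assert chosen is not None
assert chosen.min == Memory.parse("100GB")

size_mib = int(chosen.min * 1024)  # as in Disk(size_mib=disk_size_range.min * 1024)
assert size_mib == 100 * 1024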