dstack 0.19.31__py3-none-any.whl → 0.19.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack has been flagged for review.

dstack/_internal/core/backends/hotaisle/compute.py CHANGED

@@ -42,6 +42,26 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2000000000,
         "cpu_manufacturer": "Intel",
     },
+    "2x MI300X 26x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "2x MI300X 26x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "4x MI300X 52x Xeon Platinum 8462Y": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "4x MI300X 52x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
 }



dstack/_internal/core/backends/kubernetes/compute.py CHANGED

@@ -2,6 +2,7 @@ import subprocess
 import tempfile
 import threading
 import time
+from enum import Enum
 from typing import List, Optional, Tuple

 from gpuhunt import KNOWN_NVIDIA_GPUS, AcceleratorVendor
@@ -62,9 +63,28 @@ JUMP_POD_SSH_PORT = 22
 NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
 NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()

+NVIDIA_GPU_RESOURCE = "nvidia.com/gpu"
+NVIDIA_GPU_COUNT_LABEL = f"{NVIDIA_GPU_RESOURCE}.count"
+NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
+NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE
+
+# Taints we know and tolerate when creating our objects, e.g., the jump pod.
+TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT,)
+
 DUMMY_REGION = "-"


+class Operator(str, Enum):
+    EXISTS = "Exists"
+    IN = "In"
+
+
+class TaintEffect(str, Enum):
+    NO_EXECUTE = "NoExecute"
+    NO_SCHEDULE = "NoSchedule"
+    PREFER_NO_SCHEDULE = "PreferNoSchedule"
+
+
 class KubernetesCompute(
     ComputeWithFilteredOffersCached,
     ComputeWithPrivilegedSupport,
@@ -181,6 +201,7 @@ class KubernetesCompute(
         resources_requests: dict[str, str] = {}
         resources_limits: dict[str, str] = {}
         node_affinity: Optional[client.V1NodeAffinity] = None
+        tolerations: list[client.V1Toleration] = []
         volumes_: list[client.V1Volume] = []
         volume_mounts: list[client.V1VolumeMount] = []

@@ -226,21 +247,28 @@ class KubernetesCompute(
                 "Requesting %d GPU(s), node labels: %s", gpu_min, matching_gpu_label_values
             )
             # TODO: support other GPU vendors
-            resources_requests["nvidia.com/gpu"] = str(gpu_min)
-            resources_limits["nvidia.com/gpu"] = str(gpu_min)
+            resources_requests[NVIDIA_GPU_RESOURCE] = str(gpu_min)
+            resources_limits[NVIDIA_GPU_RESOURCE] = str(gpu_min)
             node_affinity = client.V1NodeAffinity(
                 required_during_scheduling_ignored_during_execution=[
                     client.V1NodeSelectorTerm(
                         match_expressions=[
                             client.V1NodeSelectorRequirement(
-                                key="nvidia.com/gpu.product",
-                                operator="In",
+                                key=NVIDIA_GPU_PRODUCT_LABEL,
+                                operator=Operator.IN,
                                 values=list(matching_gpu_label_values),
                             ),
                         ],
                     ),
                 ],
             )
+            # It should be NoSchedule, but we also add NoExecute toleration just in case.
+            for effect in [TaintEffect.NO_SCHEDULE, TaintEffect.NO_EXECUTE]:
+                tolerations.append(
+                    client.V1Toleration(
+                        key=NVIDIA_GPU_NODE_TAINT, operator=Operator.EXISTS, effect=effect
+                    )
+                )

         if (memory_min := resources_spec.memory.min) is not None:
             resources_requests["memory"] = _render_memory(memory_min)
@@ -304,6 +332,7 @@ class KubernetesCompute(
                     )
                 ],
                 affinity=node_affinity,
+                tolerations=tolerations,
                 volumes=volumes_,
             ),
         )
@@ -527,8 +556,8 @@ def _get_gpus_from_node_labels(labels: dict[str, str]) -> tuple[list[Gpu], Optio
     # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB".
     # Thus, we convert the product name to a known gpu name.
     # TODO: support other GPU vendors
-    gpu_count = labels.get("nvidia.com/gpu.count")
-    gpu_product = labels.get("nvidia.com/gpu.product")
+    gpu_count = labels.get(NVIDIA_GPU_COUNT_LABEL)
+    gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL)
     if gpu_count is None or gpu_product is None:
         return [], None
     gpu_count = int(gpu_count)
@@ -647,6 +676,39 @@ def _create_jump_pod_service(
         namespace=namespace,
         name=pod_name,
     )
+
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    # False if we found at least one node without any "hard" taint, that is, if we don't need to
+    # specify the toleration.
+    toleration_required = True
+    # (key, effect) pairs.
+    tolerated_taints: set[tuple[str, str]] = set()
+    for node in nodes:
+        # True if the node has at least one NoExecute or NoSchedule taint.
+        has_hard_taint = False
+        taints = get_value(node, ".spec.taints", list[client.V1Taint]) or []
+        for taint in taints:
+            effect = get_value(taint, ".effect", str, required=True)
+            # A "soft" taint, ignore.
+            if effect == TaintEffect.PREFER_NO_SCHEDULE:
+                continue
+            has_hard_taint = True
+            key = get_value(taint, ".key", str, required=True)
+            if key in TOLERATED_NODE_TAINTS:
+                tolerated_taints.add((key, effect))
+        if not has_hard_taint:
+            toleration_required = False
+            break
+    tolerations: list[client.V1Toleration] = []
+    if toleration_required:
+        for key, effect in tolerated_taints:
+            tolerations.append(
+                client.V1Toleration(key=key, operator=Operator.EXISTS, effect=effect)
+            )
+        if not tolerations:
+            logger.warning("No appropriate node found, the jump pod may never be scheduled")
+
     commands = _get_jump_pod_commands(authorized_keys=ssh_public_keys)
     pod = client.V1Pod(
         metadata=client.V1ObjectMeta(
@@ -667,7 +729,8 @@ def _create_jump_pod_service(
                         )
                     ],
                 )
-            ]
+            ],
+            tolerations=tolerations,
         ),
     )
     call_api_method(
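The net effect of the Kubernetes changes: GPU run pods and the SSH jump pod now tolerate the `nvidia.com/gpu` node taint that the NVIDIA device plugin / GPU Operator commonly applies, so they can be scheduled onto tainted GPU nodes. A minimal standalone sketch of the resulting pod spec using the `kubernetes` Python client (illustrative only; the container name and image are made up and not taken from the package):

    from kubernetes import client

    # Tolerate the "nvidia.com/gpu" taint for both NoSchedule and NoExecute,
    # mirroring what KubernetesCompute now attaches to GPU pods.
    tolerations = [
        client.V1Toleration(key="nvidia.com/gpu", operator="Exists", effect=effect)
        for effect in ("NoSchedule", "NoExecute")
    ]
    pod_spec = client.V1PodSpec(
        containers=[client.V1Container(name="runner", image="ubuntu:22.04")],
        tolerations=tolerations,
    )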
dstack/_internal/core/backends/nebius/compute.py CHANGED

@@ -19,6 +19,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithPrivilegedSupport,
     generate_unique_instance_name,
     get_user_data,
+    merge_tags,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
 from dstack._internal.core.backends.nebius import resources
@@ -150,6 +151,18 @@ class NebiusCompute(
         if backend_data.cluster is not None:
             cluster_id = backend_data.cluster.id

+        labels = {
+            "owner": "dstack",
+            "dstack_project": instance_config.project_name.lower(),
+            "dstack_name": instance_config.instance_name,
+            "dstack_user": instance_config.user.lower(),
+        }
+        labels = merge_tags(
+            base_tags=labels,
+            backend_tags=self.config.tags,
+            resource_tags=instance_config.tags,
+        )
+        labels = resources.filter_invalid_labels(labels)
         gpus = instance_offer.instance.resources.gpus
         create_disk_op = resources.create_disk(
             sdk=self._sdk,
@@ -159,6 +172,7 @@ class NebiusCompute(
             image_family="ubuntu24.04-cuda12"
             if gpus and gpus[0].name == "B200"
             else "ubuntu22.04-cuda12",
+            labels=labels,
         )
         create_instance_op = None
         try:
@@ -184,6 +198,7 @@ class NebiusCompute(
                 disk_id=create_disk_op.resource_id,
                 subnet_id=self._get_subnet_id(instance_offer.region),
                 preemptible=instance_offer.instance.resources.spot,
+                labels=labels,
             )
             _wait_for_instance(self._sdk, create_instance_op)
         except BaseException:

dstack/_internal/core/backends/nebius/configurator.py CHANGED

@@ -3,6 +3,7 @@ import json
 from nebius.aio.service_error import RequestError

 from dstack._internal.core.backends.base.configurator import (
+    TAGS_MAX_NUM,
     BackendRecord,
     Configurator,
     raise_invalid_credentials_error,
@@ -18,6 +19,7 @@ from dstack._internal.core.backends.nebius.models import (
     NebiusServiceAccountCreds,
     NebiusStoredConfig,
 )
+from dstack._internal.core.errors import BackendError, ServerClientError
 from dstack._internal.core.models.backends.base import BackendType


@@ -53,6 +55,19 @@ class NebiusConfigurator(
                     f" some of the valid options: {sorted(valid_fabrics)}"
                 ),
             )
+        self._check_config_tags(config)
+
+    def _check_config_tags(self, config: NebiusBackendConfigWithCreds):
+        if not config.tags:
+            return
+        if len(config.tags) > TAGS_MAX_NUM:
+            raise ServerClientError(
+                f"Maximum number of tags exceeded. Up to {TAGS_MAX_NUM} tags is allowed."
+            )
+        try:
+            resources.validate_labels(config.tags)
+        except BackendError as e:
+            raise ServerClientError(e.args[0])

     def create_backend(
         self, project_name: str, config: NebiusBackendConfigWithCreds

dstack/_internal/core/backends/nebius/models.py CHANGED

@@ -1,4 +1,6 @@
-from typing import Annotated, Literal, Optional, Union
+import json
+from pathlib import Path
+from typing import Annotated, Dict, Literal, Optional, Union

 from pydantic import Field, root_validator

@@ -27,16 +29,38 @@ class NebiusServiceAccountCreds(CoreModel):
             )
         ),
     ]
+    filename: Annotated[
+        Optional[str], Field(description="The path to the service account credentials file")
+    ] = None


 class NebiusServiceAccountFileCreds(CoreModel):
     type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
         "service_account"
     )
-    service_account_id: Annotated[str, Field(description="Service account ID")]
-    public_key_id: Annotated[str, Field(description="ID of the service account public key")]
+    service_account_id: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "Service account ID. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
+    ] = None
+    public_key_id: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "ID of the service account public key. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
+    ] = None
     private_key_file: Annotated[
-        Optional[str], Field(description=("Path to the service account private key"))
+        Optional[str],
+        Field(
+            description=(
+                "Path to the service account private key. Set automatically if `filename` or `private_key_content` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
     ] = None
     private_key_content: Annotated[
         Optional[str],
@@ -44,13 +68,35 @@ class NebiusServiceAccountFileCreds(CoreModel):
             description=(
                 "Content of the service account private key. When configuring via"
                 " `server/config.yml`, it's automatically filled from `private_key_file`."
-                " When configuring via UI, it has to be specified explicitly."
+                " When configuring via UI, it has to be specified explicitly"
             )
         ),
     ] = None
+    filename: Annotated[
+        Optional[str], Field(description="The path to the service account credentials file")
+    ] = None

     @root_validator
     def fill_data(cls, values):
+        if filename := values.get("filename"):
+            try:
+                with open(Path(filename).expanduser()) as f:
+                    data = json.load(f)
+                from nebius.base.service_account.credentials_file import (
+                    ServiceAccountCredentials,
+                )

+                credentials = ServiceAccountCredentials.from_json(data)
+                subject = credentials.subject_credentials
+                values["service_account_id"] = subject.sub
+                values["public_key_id"] = subject.kid
+                values["private_key_content"] = subject.private_key
+            except OSError:
+                raise ValueError(f"No such file {filename}")
+            except Exception as e:
+                raise ValueError(f"Failed to parse credentials file {filename}: {e}")
+            return values
+
         return fill_data(
             values, filename_field="private_key_file", data_field="private_key_content"
         )
@@ -95,6 +141,12 @@ class NebiusBackendConfig(CoreModel):
             )
         ),
     ] = None
+    tags: Annotated[
+        Optional[Dict[str, str]],
+        Field(
+            description="The tags (labels) that will be assigned to resources created by `dstack`"
+        ),
+    ] = None


 class NebiusBackendConfigWithCreds(NebiusBackendConfig):
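With the new `filename` field, Nebius credentials can be supplied as a single JSON credentials file, and the root validator above fills in the individual fields from it. A minimal sketch, assuming the `nebius` extra is installed and a valid credentials file exists at the path shown (the path is an example, not a dstack default):

    from dstack._internal.core.backends.nebius.models import NebiusServiceAccountFileCreds

    # The root validator opens the file, parses it with the Nebius SDK, and sets
    # service_account_id, public_key_id, and private_key_content; a missing or
    # malformed file raises a pydantic ValidationError (wrapping the ValueError).
    creds = NebiusServiceAccountFileCreds(filename="~/.nebius/credentials.json")
    print(creds.service_account_id, creds.public_key_id)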
dstack/_internal/core/backends/nebius/resources.py CHANGED

@@ -1,11 +1,12 @@
 import logging
+import re
 import time
 from collections import defaultdict
 from collections.abc import Container as ContainerT
 from collections.abc import Generator, Iterable, Sequence
 from contextlib import contextmanager
 from tempfile import NamedTemporaryFile
-from typing import Optional
+from typing import Dict, Optional

 from nebius.aio.authorization.options import options_to_metadata
 from nebius.aio.operation import Operation as SDKOperation
@@ -249,13 +250,14 @@ def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:


 def create_disk(
-    sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str
+    sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str, labels: Dict[str, str]
 ) -> SDKOperation[Operation]:
     client = DiskServiceClient(sdk)
     request = CreateDiskRequest(
         metadata=ResourceMetadata(
             name=name,
             parent_id=project_id,
+            labels=labels,
         ),
         spec=DiskSpec(
             size_mebibytes=size_mib,
@@ -288,12 +290,14 @@ def create_instance(
     disk_id: str,
     subnet_id: str,
     preemptible: bool,
+    labels: Dict[str, str],
 ) -> SDKOperation[Operation]:
     client = InstanceServiceClient(sdk)
     request = CreateInstanceRequest(
         metadata=ResourceMetadata(
             name=name,
             parent_id=project_id,
+            labels=labels,
         ),
         spec=InstanceSpec(
             cloud_init_user_data=user_data,
@@ -367,3 +371,42 @@ def delete_cluster(sdk: SDK, cluster_id: str) -> None:
             metadata=REQUEST_MD,
         )
     )
+
+
+def filter_invalid_labels(labels: Dict[str, str]) -> Dict[str, str]:
+    filtered_labels = {}
+    for k, v in labels.items():
+        if not _is_valid_label(k, v):
+            logger.warning("Skipping invalid label '%s: %s'", k, v)
+            continue
+        filtered_labels[k] = v
+    return filtered_labels
+
+
+def validate_labels(labels: Dict[str, str]):
+    for k, v in labels.items():
+        if not _is_valid_label(k, v):
+            raise BackendError("Invalid resource labels")
+
+
+def _is_valid_label(key: str, value: str) -> bool:
+    # TODO: [Nebius] current validation logic reuses GCP's approach.
+    # There is no public information on Nebius labels restrictions.
+    return is_valid_resource_name(key) and is_valid_label_value(value)
+
+
+MAX_RESOURCE_NAME_LEN = 63
+NAME_PATTERN = re.compile(r"^[a-z][_\-a-z0-9]{0,62}$")
+LABEL_VALUE_PATTERN = re.compile(r"^[_\-a-z0-9]{0,63}$")
+
+
+def is_valid_resource_name(name: str) -> bool:
+    if len(name) < 1 or len(name) > MAX_RESOURCE_NAME_LEN:
+        return False
+    match = re.match(NAME_PATTERN, name)
+    return match is not None
+
+
+def is_valid_label_value(value: str) -> bool:
+    match = re.match(LABEL_VALUE_PATTERN, value)
+    return match is not None
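The label helpers above apply GCP-style restrictions: keys must be lowercase and start with a letter; values may only contain lowercase letters, digits, underscores, and hyphens, up to 63 characters. A small sketch with made-up labels (requires the `nebius` extra, since the module imports the Nebius SDK):

    from dstack._internal.core.backends.nebius import resources

    labels = {
        "team": "ml-research",   # passes both patterns
        "Cost-Center": "R&D",    # uppercase and "&" are rejected
    }

    # filter_invalid_labels() drops bad entries (logging a warning) before
    # disks/instances are created, so provisioning never fails on bad tags.
    assert resources.filter_invalid_labels(labels) == {"team": "ml-research"}

    # validate_labels() raises BackendError; the configurator converts it to
    # ServerClientError when checking backend-level `tags`.
    resources.validate_labels(labels)  # raises BackendError("Invalid resource labels")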
dstack/_internal/core/compatibility/runs.py CHANGED

@@ -53,6 +53,10 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
     }
     if all(js.exit_status is None for js in job_submissions):
         job_submissions_excludes["exit_status"] = True
+    if all(js.status_message == "" for js in job_submissions):
+        job_submissions_excludes["status_message"] = True
+    if all(js.error is None for js in job_submissions):
+        job_submissions_excludes["error"] = True
     if all(js.deployment_num == 0 for js in job_submissions):
         job_submissions_excludes["deployment_num"] = True
     if all(not js.probes for js in job_submissions):
@@ -71,6 +75,10 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
     }
     if latest_job_submission.exit_status is None:
         latest_job_submission_excludes["exit_status"] = True
+    if latest_job_submission.status_message == "":
+        latest_job_submission_excludes["status_message"] = True
+    if latest_job_submission.error is None:
+        latest_job_submission_excludes["error"] = True
     if latest_job_submission.deployment_num == 0:
         latest_job_submission_excludes["deployment_num"] = True
     if not latest_job_submission.probes:

dstack/_internal/core/models/profiles.py CHANGED

@@ -80,14 +80,21 @@ def parse_stop_duration(
 def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[Literal["off"], int]]:
     if v == "off" or v is False:
         return "off"
-    if v is True:
+    if v is True or v is None:
         return None
-    return parse_duration(v)
+    duration = parse_duration(v)
+    if duration < 0:
+        raise ValueError("Duration cannot be negative")
+    return duration


-def parse_idle_duration(v: Optional[Union[int, str]]) -> Optional[int]:
-    if v == "off" or v == -1:
+def parse_idle_duration(v: Optional[Union[int, str, bool]]) -> Optional[int]:
+    # Differs from `parse_off_duration` to accept negative durations as `off`
+    # for backward compatibility.
+    if v == "off" or v is False or v == -1:
         return -1
+    if v is True:
+        return None
     return parse_duration(v)

dstack/_internal/server/background/tasks/process_fleets.py CHANGED

@@ -1,10 +1,11 @@
+from collections import defaultdict
 from datetime import timedelta
 from typing import List
 from uuid import UUID

 from sqlalchemy import select, update
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload, load_only
+from sqlalchemy.orm import joinedload, load_only, selectinload

 from dstack._internal.core.models.fleets import FleetSpec, FleetStatus
 from dstack._internal.core.models.instances import InstanceStatus
@@ -37,30 +38,68 @@ MIN_PROCESSING_INTERVAL = timedelta(seconds=30)

 @sentry_utils.instrument_background_task
 async def process_fleets():
-    lock, lockset = get_locker(get_db().dialect_name).get_lockset(FleetModel.__tablename__)
+    fleet_lock, fleet_lockset = get_locker(get_db().dialect_name).get_lockset(
+        FleetModel.__tablename__
+    )
+    instance_lock, instance_lockset = get_locker(get_db().dialect_name).get_lockset(
+        InstanceModel.__tablename__
+    )
     async with get_session_ctx() as session:
-        async with lock:
+        async with fleet_lock, instance_lock:
            res = await session.execute(
                select(FleetModel)
                .where(
                    FleetModel.deleted == False,
-                    FleetModel.id.not_in(lockset),
+                    FleetModel.id.not_in(fleet_lockset),
                    FleetModel.last_processed_at
                    < get_current_datetime() - MIN_PROCESSING_INTERVAL,
                )
-                .options(load_only(FleetModel.id))
+                .options(
+                    load_only(FleetModel.id, FleetModel.name),
+                    selectinload(FleetModel.instances).load_only(InstanceModel.id),
+                )
                .order_by(FleetModel.last_processed_at.asc())
                .limit(BATCH_SIZE)
                .with_for_update(skip_locked=True, key_share=True)
            )
-            fleet_models = list(res.scalars().all())
+            fleet_models = list(res.scalars().unique().all())
            fleet_ids = [fm.id for fm in fleet_models]
+            res = await session.execute(
+                select(InstanceModel)
+                .where(
+                    InstanceModel.id.not_in(instance_lockset),
+                    InstanceModel.fleet_id.in_(fleet_ids),
+                )
+                .options(load_only(InstanceModel.id, InstanceModel.fleet_id))
+                .order_by(InstanceModel.id)
+                .with_for_update(skip_locked=True, key_share=True)
+            )
+            instance_models = list(res.scalars().all())
+            fleet_id_to_locked_instances = defaultdict(list)
+            for instance_model in instance_models:
+                fleet_id_to_locked_instances[instance_model.fleet_id].append(instance_model)
+            # Process only fleets with all instances locked.
+            # Other fleets won't be processed but will still be locked to avoid new transaction.
+            # This should not be problematic as long as process_fleets is quick.
+            fleet_models_to_process = []
+            for fleet_model in fleet_models:
+                if len(fleet_model.instances) == len(fleet_id_to_locked_instances[fleet_model.id]):
+                    fleet_models_to_process.append(fleet_model)
+                else:
+                    logger.debug(
+                        "Fleet %s processing will be skipped: some instance were not locked",
+                        fleet_model.name,
+                    )
            for fleet_id in fleet_ids:
-                lockset.add(fleet_id)
+                fleet_lockset.add(fleet_id)
+            instance_ids = [im.id for im in instance_models]
+            for instance_id in instance_ids:
+                instance_lockset.add(instance_id)
            try:
-                await _process_fleets(session=session, fleet_models=fleet_models)
+                await _process_fleets(session=session, fleet_models=fleet_models_to_process)
            finally:
-                lockset.difference_update(fleet_ids)
+                fleet_lockset.difference_update(fleet_ids)
+                instance_lockset.difference_update(instance_ids)


 async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel]):
@@ -99,8 +138,8 @@ def _consolidate_fleet_state_with_spec(session: AsyncSession, fleet_model: Fleet
         return
     if not _is_fleet_ready_for_consolidation(fleet_model):
         return
-    added_instances = _maintain_fleet_nodes_min(session, fleet_model, fleet_spec)
-    if added_instances:
+    changed_instances = _maintain_fleet_nodes_in_min_max_range(session, fleet_model, fleet_spec)
+    if changed_instances:
         fleet_model.consolidation_attempt += 1
     else:
         # The fleet is already consolidated or consolidation is in progress.
@@ -138,28 +177,47 @@ def _get_consolidation_retry_delay(consolidation_attempt: int) -> timedelta:
     return _CONSOLIDATION_RETRY_DELAYS[-1]


-def _maintain_fleet_nodes_min(
+def _maintain_fleet_nodes_in_min_max_range(
     session: AsyncSession,
     fleet_model: FleetModel,
     fleet_spec: FleetSpec,
 ) -> bool:
     """
-    Ensures the fleet has at least `nodes.min` instances.
-    Returns `True` if retried or added new instances and `False` otherwise.
+    Ensures the fleet has at least `nodes.min` and at most `nodes.max` instances.
+    Returns `True` if retried, added new instances, or terminated redundant instances and `False` otherwise.
     """
     assert fleet_spec.configuration.nodes is not None
     for instance in fleet_model.instances:
         # Delete terminated but not deleted instances since
         # they are going to be replaced with new pending instances.
         if instance.status == InstanceStatus.TERMINATED and not instance.deleted:
-            # It's safe to modify instances without instance lock since
-            # no other task modifies already terminated instances.
             instance.deleted = True
             instance.deleted_at = get_current_datetime()
     active_instances = [i for i in fleet_model.instances if not i.deleted]
     active_instances_num = len(active_instances)
     if active_instances_num >= fleet_spec.configuration.nodes.min:
-        return False
+        if (
+            fleet_spec.configuration.nodes.max is None
+            or active_instances_num <= fleet_spec.configuration.nodes.max
+        ):
+            return False
+        # Fleet has more instances than allowed by nodes.max.
+        # This is possible due to race conditions (e.g. provisioning jobs in a fleet concurrently)
+        # or if nodes.max is updated.
+        nodes_redundant = active_instances_num - fleet_spec.configuration.nodes.max
+        for instance in fleet_model.instances:
+            if nodes_redundant == 0:
+                break
+            if instance.status in [InstanceStatus.IDLE]:
+                instance.status = InstanceStatus.TERMINATING
+                instance.termination_reason = "Fleet has too many instances"
+                nodes_redundant -= 1
+                logger.info(
+                    "Terminating instance %s: %s",
+                    instance.name,
+                    instance.termination_reason,
+                )
+        return True
     nodes_missing = fleet_spec.configuration.nodes.min - active_instances_num
     for i in range(nodes_missing):
         instance_model = create_fleet_instance_model(
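The consolidation rule above can be summarized in isolation. This standalone sketch mirrors the decision (it is not the dstack function itself, and the numbers are illustrative):

    from typing import Optional, Tuple

    def plan_consolidation(active: int, nodes_min: int, nodes_max: Optional[int]) -> Tuple[str, int]:
        """What the min/max range check decides for a fleet with `active` non-deleted instances."""
        if active < nodes_min:
            return ("add", nodes_min - active)             # create missing pending instances
        if nodes_max is not None and active > nodes_max:
            return ("terminate_idle", active - nodes_max)  # mark redundant IDLE instances TERMINATING
        return ("noop", 0)                                 # already consolidated

    assert plan_consolidation(active=1, nodes_min=2, nodes_max=4) == ("add", 1)
    assert plan_consolidation(active=6, nodes_min=2, nodes_max=4) == ("terminate_idle", 2)
    assert plan_consolidation(active=3, nodes_min=2, nodes_max=None) == ("noop", 0)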
dstack/_internal/server/background/tasks/process_instances.py CHANGED

@@ -259,9 +259,7 @@ async def _add_remote(instance: InstanceModel) -> None:
     if instance.status == InstanceStatus.PENDING:
         instance.status = InstanceStatus.PROVISIONING

-    retry_duration_deadline = instance.created_at.replace(
-        tzinfo=datetime.timezone.utc
-    ) + timedelta(seconds=PROVISIONING_TIMEOUT_SECONDS)
+    retry_duration_deadline = instance.created_at + timedelta(seconds=PROVISIONING_TIMEOUT_SECONDS)
     if retry_duration_deadline < get_current_datetime():
         instance.status = InstanceStatus.TERMINATED
         instance.termination_reason = "Provisioning timeout expired"

dstack/_internal/server/background/tasks/process_runs.py CHANGED

@@ -256,8 +256,8 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
     for replica_num, job_models in group_jobs_by_replica_latest(run_model.jobs):
         replica_statuses: Set[RunStatus] = set()
         replica_needs_retry = False
-
         replica_active = True
+        jobs_done_num = 0
         for job_model in job_models:
             job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
             if (
@@ -272,8 +272,7 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
             ):
                 # the job is done or going to be done
                 replica_statuses.add(RunStatus.DONE)
-                # for some reason the replica is done, it's not active
-                replica_active = False
+                jobs_done_num += 1
             elif job_model.termination_reason == JobTerminationReason.SCALED_DOWN:
                 # the job was scaled down
                 replica_active = False
@@ -313,26 +312,14 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
         if not replica_needs_retry or retry_single_job:
             run_statuses.update(replica_statuses)

-        if replica_active:
-            # submitted_at = replica created
-            replicas_info.append(
-                autoscalers.ReplicaInfo(
-                    active=True,
-                    timestamp=min(job.submitted_at for job in job_models).replace(
-                        tzinfo=datetime.timezone.utc
-                    ),
-                )
-            )
-        else:
-            # last_processed_at = replica scaled down
-            replicas_info.append(
-                autoscalers.ReplicaInfo(
-                    active=False,
-                    timestamp=max(job.last_processed_at for job in job_models).replace(
-                        tzinfo=datetime.timezone.utc
-                    ),
-                )
-            )
+        if jobs_done_num == len(job_models):
+            # Consider replica inactive if all its jobs are done for some reason.
+            # If only some jobs are done, replica is considered active to avoid
+            # provisioning new replicas for partially done multi-node tasks.
+            replica_active = False
+
+        replica_info = _get_replica_info(job_models, replica_active)
+        replicas_info.append(replica_info)

     termination_reason: Optional[RunTerminationReason] = None
     if RunStatus.FAILED in run_statuses:
@@ -410,6 +397,23 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
         run_model.resubmission_attempt += 1


+def _get_replica_info(
+    replica_job_models: list[JobModel],
+    replica_active: bool,
+) -> autoscalers.ReplicaInfo:
+    if replica_active:
+        # submitted_at = replica created
+        return autoscalers.ReplicaInfo(
+            active=True,
+            timestamp=min(job.submitted_at for job in replica_job_models),
+        )
+    # last_processed_at = replica scaled down
+    return autoscalers.ReplicaInfo(
+        active=False,
+        timestamp=max(job.last_processed_at for job in replica_job_models),
+    )
+
+
 async def _handle_run_replicas(
     session: AsyncSession,
     run_model: RunModel,

dstack/_internal/server/background/tasks/process_submitted_jobs.py CHANGED

@@ -260,7 +260,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):

     instance_filters = [
         InstanceModel.deleted == False,
-        InstanceModel.total_blocks > InstanceModel.busy_blocks,
         InstanceModel.id.not_in(detaching_instances_ids),
     ]

@@ -514,9 +513,6 @@ async def _find_optimal_fleet_with_offers(
         )
         return run_model.fleet, fleet_instances_with_pool_offers

-    if len(fleet_models) == 0:
-        return None, []
-
     nodes_required_num = _get_nodes_required_num_for_run(run_spec)
     # The current strategy is first to consider fleets that can accommodate
     # the run without additional provisioning and choose the one with the cheapest pool offer.
@@ -534,6 +530,7 @@ async def _find_optimal_fleet_with_offers(
         ]
     ] = []
     for candidate_fleet_model in fleet_models:
+        candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
         fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
             fleet_model=candidate_fleet_model,
             run_spec=run_spec,
@@ -541,24 +538,21 @@ async def _find_optimal_fleet_with_offers(
             master_job_provisioning_data=master_job_provisioning_data,
             volumes=volumes,
         )
-        fleet_has_available_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
+        fleet_has_pool_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
         fleet_cheapest_pool_offer = math.inf
         if len(fleet_instances_with_pool_offers) > 0:
            fleet_cheapest_pool_offer = fleet_instances_with_pool_offers[0][1].price

-        candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
-        profile = None
-        requirements = None
         try:
+            _check_can_create_new_instance_in_fleet(candidate_fleet)
            profile, requirements = _get_run_profile_and_requirements_in_fleet(
                job=job,
                run_spec=run_spec,
                fleet=candidate_fleet,
            )
         except ValueError:
-            pass
-        fleet_backend_offers = []
-        if profile is not None and requirements is not None:
+            fleet_backend_offers = []
+        else:
            multinode = (
                candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
                or job.job_spec.jobs_per_replica > 1
@@ -579,8 +573,12 @@ async def _find_optimal_fleet_with_offers(
         if len(fleet_backend_offers) > 0:
            fleet_cheapest_backend_offer = fleet_backend_offers[0][1].price

+        if not _run_can_fit_into_fleet(run_spec, candidate_fleet):
+            logger.debug("Skipping fleet %s from consideration: run cannot fit into fleet")
+            continue
+
         fleet_priority = (
-            not fleet_has_available_capacity,
+            not fleet_has_pool_capacity,
            fleet_cheapest_pool_offer,
            fleet_cheapest_backend_offer,
         )
@@ -593,10 +591,13 @@ async def _find_optimal_fleet_with_offers(
                fleet_priority,
            )
         )
+    if len(candidate_fleets_with_offers) == 0:
+        return None, []
     if run_spec.merged_profile.fleets is None and all(
         t[2] == 0 and t[3] == 0 for t in candidate_fleets_with_offers
     ):
-        # If fleets are not specified and no fleets have available pool or backend offers, create a new fleet.
+        # If fleets are not specified and no fleets have available pool
+        # or backend offers, create a new fleet.
         # This is for compatibility with non-fleet-first UX when runs created new fleets
         # if there are no instances to reuse.
         return None, []
@@ -616,6 +617,39 @@ def _get_nodes_required_num_for_run(run_spec: RunSpec) -> int:
     return nodes_required_num


+def _run_can_fit_into_fleet(run_spec: RunSpec, fleet: Fleet) -> bool:
+    """
+    Returns `False` if the run cannot fit into fleet for sure.
+    This is helpful heuristic to avoid even considering fleets too small for a run.
+    A run may not fit even if this function returns `True`.
+    This will lead to some jobs failing due to exceeding `nodes.max`
+    or more than `nodes.max` instances being provisioned
+    and eventually removed by the fleet consolidation logic.
+    """
+    # No check for cloud fleets with blocks > 1 since we don't know
+    # how many jobs such fleets can accommodate.
+    nodes_required_num = _get_nodes_required_num_for_run(run_spec)
+    if (
+        fleet.spec.configuration.nodes is not None
+        and fleet.spec.configuration.blocks == 1
+        and fleet.spec.configuration.nodes.max is not None
+    ):
+        busy_instances = [i for i in fleet.instances if i.busy_blocks > 0]
+        fleet_available_capacity = fleet.spec.configuration.nodes.max - len(busy_instances)
+        if fleet_available_capacity < nodes_required_num:
+            return False
+    elif fleet.spec.configuration.ssh_config is not None:
+        # Currently assume that each idle block can run a job.
+        # TODO: Take resources / eligible offers into account.
+        total_idle_blocks = 0
+        for instance in fleet.instances:
+            total_blocks = instance.total_blocks or 1
+            total_idle_blocks += total_blocks - instance.busy_blocks
+        if total_idle_blocks < nodes_required_num:
+            return False
+    return True
+
+
 def _get_fleet_instances_with_pool_offers(
     fleet_model: FleetModel,
     run_spec: RunSpec,
@@ -713,6 +747,7 @@ async def _run_job_on_new_instance(
     if fleet_model is not None:
         fleet = fleet_model_to_fleet(fleet_model)
         try:
+            _check_can_create_new_instance_in_fleet(fleet)
            profile, requirements = _get_run_profile_and_requirements_in_fleet(
                job=job,
                run_spec=run.run_spec,
@@ -787,8 +822,6 @@ def _get_run_profile_and_requirements_in_fleet(
     run_spec: RunSpec,
     fleet: Fleet,
 ) -> tuple[Profile, Requirements]:
-    if not _check_can_create_new_instance_in_fleet(fleet):
-        raise ValueError("Cannot fit new instance into fleet")
     profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, run_spec.merged_profile)
     if profile is None:
         raise ValueError("Cannot combine fleet profile")
@@ -801,13 +834,23 @@ def _get_run_profile_and_requirements_in_fleet(
     return profile, requirements


-def _check_can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
+def _check_can_create_new_instance_in_fleet(fleet: Fleet):
+    if not _can_create_new_instance_in_fleet(fleet):
+        raise ValueError("Cannot fit new instance into fleet")
+
+
+def _can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
     if fleet.spec.configuration.ssh_config is not None:
         return False
-    # TODO: Respect nodes.max
-    # Ensure concurrent provisioning does not violate nodes.max
-    # E.g. lock fleet and split instance model creation
-    # and instance provisioning into separate transactions.
+    active_instances = [i for i in fleet.instances if i.status.is_active()]
+    # nodes.max is a soft limit that can be exceeded when provisioning concurrently.
+    # The fleet consolidation logic will remove redundant nodes eventually.
+    if (
+        fleet.spec.configuration.nodes is not None
+        and fleet.spec.configuration.nodes.max is not None
+        and len(active_instances) >= fleet.spec.configuration.nodes.max
+    ):
+        return False
     return True

dstack/version.py CHANGED
@@ -1,4 +1,4 @@
-__version__ = "0.19.31"
+__version__ = "0.19.32"
 __is_release__ = True
 base_image = "0.11rc2"
 base_image_ubuntu_version = "22.04"

dstack-0.19.31.dist-info/METADATA → dstack-0.19.32.dist-info/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dstack
-Version: 0.19.31
+Version: 0.19.32
 Summary: dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.
 Project-URL: Homepage, https://dstack.ai
 Project-URL: Source, https://github.com/dstackai/dstack
@@ -73,7 +73,7 @@ Requires-Dist: grpcio>=1.50; extra == 'all'
 Requires-Dist: httpx; extra == 'all'
 Requires-Dist: jinja2; extra == 'all'
 Requires-Dist: kubernetes; extra == 'all'
-Requires-Dist: nebius<0.3,>=0.2.40; (python_version >= '3.10') and extra == 'all'
+Requires-Dist: nebius<=0.2.72,>=0.2.40; (python_version >= '3.10') and extra == 'all'
 Requires-Dist: oci>=2.150.0; extra == 'all'
 Requires-Dist: prometheus-client; extra == 'all'
 Requires-Dist: pyopenssl>=23.2.0; extra == 'all'
@@ -259,7 +259,7 @@ Requires-Dist: fastapi; extra == 'nebius'
 Requires-Dist: grpcio>=1.50; extra == 'nebius'
 Requires-Dist: httpx; extra == 'nebius'
 Requires-Dist: jinja2; extra == 'nebius'
-Requires-Dist: nebius<0.3,>=0.2.40; (python_version >= '3.10') and extra == 'nebius'
+Requires-Dist: nebius<=0.2.72,>=0.2.40; (python_version >= '3.10') and extra == 'nebius'
 Requires-Dist: prometheus-client; extra == 'nebius'
 Requires-Dist: python-dxf==12.1.0; extra == 'nebius'
 Requires-Dist: python-json-logger>=3.1.0; extra == 'nebius'
@@ -340,15 +340,13 @@ It streamlines development, training, and inference, and is compatible with any
 `dstack` supports `NVIDIA`, `AMD`, `Google TPU`, `Intel Gaudi`, and `Tenstorrent` accelerators out of the box.

 ## Latest news ✨
-- [2025/09] [dstack 0.19.27: Offers UI, Digital Ocean and AMD Developer Cloud](https://github.com/dstackai/dstack/releases/tag/0.19.27)
-- [2025/08] [dstack 0.19.26: Repos – explicit repo configuration via YAML](https://github.com/dstackai/dstack/releases/tag/0.19.26)
-- [2025/08] [dstack 0.19.25: `dstack offer` CLI command](https://github.com/dstackai/dstack/releases/tag/0.19.25)
-- [2025/08] [dstack 0.19.22: Service probes, GPU health-checks, Tenstorrent Galaxy, Secrets UI](https://github.com/dstackai/dstack/releases/tag/0.19.22)
+- [2025/10] [dstack 0.19.31: Kubernetes, GCP A4 spot](https://github.com/dstackai/dstack/releases/tag/0.19.31)
+- [2025/08] [dstack 0.19.26: Repos](https://github.com/dstackai/dstack/releases/tag/0.19.26)
+- [2025/08] [dstack 0.19.22: Service probes, GPU health-checks, Tenstorrent Galaxy](https://github.com/dstackai/dstack/releases/tag/0.19.22)
 - [2025/07] [dstack 0.19.21: Scheduled tasks](https://github.com/dstackai/dstack/releases/tag/0.19.21)
 - [2025/07] [dstack 0.19.17: Secrets, Files, Rolling deployment](https://github.com/dstackai/dstack/releases/tag/0.19.17)
-- [2025/06] [dstack 0.19.16: Docker in Docker, CloudRift](https://github.com/dstackai/dstack/releases/tag/0.19.16)
-- [2025/06] [dstack 0.19.13: InfiniBand support in default images](https://github.com/dstackai/dstack/releases/tag/0.19.13)
-- [2025/06] [dstack 0.19.12: Simplified use of MPI](https://github.com/dstackai/dstack/releases/tag/0.19.12)
+- [2025/06] [dstack 0.19.16: Docker in Docker](https://github.com/dstackai/dstack/releases/tag/0.19.16)
+- [2025/06] [dstack 0.19.13: Default images with InfiniBand support](https://github.com/dstackai/dstack/releases/tag/0.19.13)

 ## How does it work?

@@ -364,11 +362,11 @@ It streamlines development, training, and inference, and is compatible with any

 To orchestrate compute across cloud providers or existing Kubernetes clusters, you need to configure backends.

-Backends can be set up in `~/.dstack/server/config.yml` or through the [project settings page](../concepts/projects.md#backends) in the UI.
+Backends can be set up in `~/.dstack/server/config.yml` or through the [project settings page](https://dstack.ai/docs/concepts/projects#backends) in the UI.

-For more details, see [Backends](../concepts/backends.md).
+For more details, see [Backends](https://dstack.ai/docs/concepts/backends).

-> When using `dstack` with on-prem servers, backend configuration isn’t required. Simply create [SSH fleets](../concepts/fleets.md#ssh) once the server is up.
+> When using `dstack` with on-prem servers, backend configuration isn’t required. Simply create [SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh) once the server is up.

 ##### Start the server

dstack-0.19.31.dist-info/RECORD → dstack-0.19.32.dist-info/RECORD CHANGED

@@ -1,5 +1,5 @@
 dstack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-dstack/version.py,sha256=bfw1WiD5UTLEsyy2XkGQGNyKevTEg-OnV98FYK9gm7Q,105
+dstack/version.py,sha256=DLiOZq8Gabr_DjHGIzjxI9IasDON-4xNaF3b4Rt2BBI,105
 dstack/_internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/compat.py,sha256=bF9U9fTMfL8UVhCouedoUSTYFl7UAOiU0WXrnRoByxw,40
 dstack/_internal/settings.py,sha256=FYtd7tRk17Oc62Kl_3O8NuT5JHb8TKhLThl1TsfjjVs,1390
@@ -118,12 +118,12 @@ dstack/_internal/core/backends/gcp/features/tcpx.py,sha256=8bDR5kwF5qke5EWNdBscd
 dstack/_internal/core/backends/hotaisle/__init__.py,sha256=CYMaS1jd9Km0Y6Jvg4ePjYOtfqL9swGsRo5kcXGFrFQ,30
 dstack/_internal/core/backends/hotaisle/api_client.py,sha256=Fd1TOg4_orwQyJtoZ657zJweLeBhzj_9ObfL538S5uI,3640
 dstack/_internal/core/backends/hotaisle/backend.py,sha256=o0cqLIKGcrXhvksHHGvjCpLShoQxT2IKdJy9sm0H9gE,586
-dstack/_internal/core/backends/hotaisle/compute.py,sha256=y72Mmzhq2xVGc5tKK3k7_ovog8_vXVngfLhcIvH-p2I,7551
+dstack/_internal/core/backends/hotaisle/compute.py,sha256=X9XbIatbFH5wqLoSH3Z9nNOhBMrXnVVayFn6xi4zu-g,8224
 dstack/_internal/core/backends/hotaisle/configurator.py,sha256=EJwdKFfC0ab0pe4lzeV65b80Ok21rR0OfupOmuqCp6c,2287
 dstack/_internal/core/backends/hotaisle/models.py,sha256=CmJ20SbpKzFldX7rrR0CpVytSJSN2YWKQ3Ixnta_A1M,1334
 dstack/_internal/core/backends/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/core/backends/kubernetes/backend.py,sha256=Jy0_Nwn6Oro8McJIo_QeNxxq4Pmwsd7JPd5_YE8Fz9U,606
-dstack/_internal/core/backends/kubernetes/compute.py,sha256=AiZk5uWtON-QXyi4iVC1InmkNmtUhE6mJXWfCKF8KG0,32428
+dstack/_internal/core/backends/kubernetes/compute.py,sha256=7xVdo2HK-dTZjkQQtuIfiTXLhBSzhUO0BomZBaPG5UM,34989
 dstack/_internal/core/backends/kubernetes/configurator.py,sha256=RK8_eznv1AFrcG3fM-KIxyolsaJ8UTBAO7c3P3RCBnw,2228
 dstack/_internal/core/backends/kubernetes/models.py,sha256=vGOhRYP4OzhF62BN5bfRGd4E2tKPaqdlZY8tMmjZoJ0,2308
 dstack/_internal/core/backends/kubernetes/utils.py,sha256=1DkkL_VWShFFqN-Crh0ddebRyXXyL435FyrjVkFLR1Q,6286
@@ -138,11 +138,11 @@ dstack/_internal/core/backends/local/backend.py,sha256=KJuNXUXrg60NhLywnExD1EXH2
 dstack/_internal/core/backends/local/compute.py,sha256=tWNsKGKYlPx9yeqwlpAL_XExOYMPLcb6AsGAji3YO3M,3825
 dstack/_internal/core/backends/nebius/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/core/backends/nebius/backend.py,sha256=2XqZIbSR8VzlfOnuVklXlDxNmwAkQj7txQN8VXF1j2E,566
-dstack/_internal/core/backends/nebius/compute.py,sha256=bBfNai_GkrHzWHnRRnBkUObQxV4aD_Fog9eQiaPL0Kw,14920
-dstack/_internal/core/backends/nebius/configurator.py,sha256=PilZ5M0xj-koYz9PPew9L29rrHoF2JrW2bxgCqt27u4,3213
+dstack/_internal/core/backends/nebius/compute.py,sha256=US5W8Q0UT09huQybDTBLVE2EDpRG3UdMqq1DZFFaCI4,15454
+dstack/_internal/core/backends/nebius/configurator.py,sha256=eybolJi5rlEeU8GBXC7pdOU7To32ASQGHDAiE2cNeFo,3794
 dstack/_internal/core/backends/nebius/fabrics.py,sha256=-X-nSPV2pUin2PAYDHGm-j14KPboIFRpLi93PKHUXTM,1616
-dstack/_internal/core/backends/nebius/models.py,sha256=UudYX32p-ZY-GWR83VEtY5dpZBaWhKXQIfn2nrBCq-4,4245
-dstack/_internal/core/backends/nebius/resources.py,sha256=ttgwdqokvXF8BH_IDPFZxWqr1uAMpdO3_Q31VleiXvk,12731
+dstack/_internal/core/backends/nebius/models.py,sha256=OSiUANBf893Xdm7-4WDoPfmd3YFk5-oRjdXiWUjvDdk,6194
+dstack/_internal/core/backends/nebius/resources.py,sha256=MVyMaS0-mZu2g-tJ4HF7GiT1hFFL7Mha9hXtP3XeT7o,14070
 dstack/_internal/core/backends/oci/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/core/backends/oci/auth.py,sha256=8Cr18y_LOsyRP-16yfFpT70Cofpm0clB3KawS_7aRl4,717
 dstack/_internal/core/backends/oci/backend.py,sha256=yXjVCt7n6BVLH0byYFbNFf-P9J0FwlNfxsYbKGMdoI4,536
@@ -182,7 +182,7 @@ dstack/_internal/core/compatibility/fleets.py,sha256=jg42A7OmprqATKKt6JpLL1qOQSZ
 dstack/_internal/core/compatibility/gateways.py,sha256=4h_lfpN9KJFyLTFexq-wlu74Rwpk0anV67v38aJ-SnI,1463
 dstack/_internal/core/compatibility/gpus.py,sha256=myWVUjaK2S1QuYgRZyMtD8-DPKQTjadSbsnECtfoHHs,575
 dstack/_internal/core/compatibility/logs.py,sha256=keXt3OFKR0CjD_XMsetzRu8yQGCz7CWBwycuP267L_Q,629
-dstack/_internal/core/compatibility/runs.py,sha256=kdYvirgXWO2m9Uj_LOadzMUsJSWXtFY6yEdNadNypZs,8869
+dstack/_internal/core/compatibility/runs.py,sha256=pT5RxheOzJJXFpJM1th-ku9-inj3McMBcdEHxcMBp9U,9357
 dstack/_internal/core/compatibility/volumes.py,sha256=ofjpVusuc-pq285bGrIh8PAqu0QlAd6NQgU3gfJQIc0,1546
 dstack/_internal/core/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/core/models/common.py,sha256=QKdZM7L2NepzOPavkkI_q3g6WYCauMTbtZSZwWZVYHE,4704
@@ -197,7 +197,7 @@ dstack/_internal/core/models/instances.py,sha256=Gpv46fu3uWO-3f8w1A6rBzU5dYhmO_w
 dstack/_internal/core/models/logs.py,sha256=VOsgEsvUIRNNHivD6OZnPZNC52ioqafv7ccdnFQ1YI8,529
 dstack/_internal/core/models/metrics.py,sha256=Xb8hCXUL-ncQ3PMsErIUAJTe9gwh5jyrQ4UQoZbibsc,269
 dstack/_internal/core/models/placement.py,sha256=WJVq5ENJykyRarQzL2EeYQag_9_jV7VSAtR_xoFvPVM,720
-dstack/_internal/core/models/profiles.py,sha256=_ZxSk-rMvzpiUYUZD0EQbrJDy3FVmRUmIUtEgzlVeHo,14217
+dstack/_internal/core/models/profiles.py,sha256=YW7XeztKBnVCptvUyR8-E-dbB06nu9GIqwLvmtHgSms,14501
 dstack/_internal/core/models/projects.py,sha256=hOZoL85q-873vT_Aw7FhzpS6DGVt0Y3yT8kpElrLFto,833
 dstack/_internal/core/models/resources.py,sha256=-dLupzud5BSqxNABBjLVYTCKekbr9_mhaNGD1ZWBjgM,14544
 dstack/_internal/core/models/runs.py,sha256=ehGSyCSx5OAaEqCEd2YvCRiP_uewUeDOHbJGSocu6w0,22609
@@ -289,17 +289,17 @@ dstack/_internal/server/settings.py,sha256=7SRzSlTnUPNNjlZH-vpgwbwj95gI-LLtQite8
 dstack/_internal/server/background/__init__.py,sha256=QftEjgQZffu83sY-F0WL65vRp28FbBEtezfowQYcTv4,5606
 dstack/_internal/server/background/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/server/background/tasks/common.py,sha256=n87hFjDNtS2x8mYbBnKLqhXus1P8qkdfqXG1TSeIJjM,1089
-dstack/_internal/server/background/tasks/process_fleets.py,sha256=b0wECmweaetFv9vSjE1BiOJmlIPlk7O-fmntpdSiWgg,8590
+dstack/_internal/server/background/tasks/process_fleets.py,sha256=0i_S3HCZp4AjQjER7j_pvIm22eYbrBwZnt9-0kgsq3E,11547
 dstack/_internal/server/background/tasks/process_gateways.py,sha256=FH9RY3Tfmtw_UctCdYZDIRb2rgtmHdxTg6Oc4IBiDBA,8356
 dstack/_internal/server/background/tasks/process_idle_volumes.py,sha256=mqnl8wvWaKTYvJMbgFJbOP-bZMRQG2vrhUnaNcyBldE,5223
-dstack/_internal/server/background/tasks/process_instances.py,sha256=siltiaMbm1zUi4r0CwUODj_8iRx5twJpVAp4cP05n6w,44059
+dstack/_internal/server/background/tasks/process_instances.py,sha256=IeKl28NSC-va5QABW0PC3eDGU6r-Xz1TNuSK2EtIX08,44007
 dstack/_internal/server/background/tasks/process_metrics.py,sha256=yKXe9J7m3oleK0C-oGJaYNkcPT8kqkz0nw-A7xqYbjE,6390
 dstack/_internal/server/background/tasks/process_placement_groups.py,sha256=lgYIzjHG9EITK31yG6uQjlIcSwW5jsP9ZOBBZqW_eNs,4263
 dstack/_internal/server/background/tasks/process_probes.py,sha256=dmug-_rmYiVLLF-imto-Ju1gPtENvHvCjHyilqgYuJw,6457
 dstack/_internal/server/background/tasks/process_prometheus_metrics.py,sha256=_UZm37FVV4rhdd0So7HtcKbIgrSdAr5Vx-Uen_xizec,5459
 dstack/_internal/server/background/tasks/process_running_jobs.py,sha256=IoQi7mm4upEZgujTkWYrXDKrC5rSZ5Q4_jAR4OpajaM,44973
-dstack/_internal/server/background/tasks/process_runs.py,sha256=Cx7Z1B7pZVlvCl-OsIaAiIMFG_aZDdn3nlZeha6k2x4,25041
-dstack/_internal/server/background/tasks/process_submitted_jobs.py,sha256=XxPapMdCsuA_H_X27SIwIZFd0Y5jzwvIABnhqa-kwyQ,41098
+dstack/_internal/server/background/tasks/process_runs.py,sha256=K4km4XT0JYUf6JYbpKbEAyumUDBT21lqcMFTQ7pIsoY,25200
+dstack/_internal/server/background/tasks/process_submitted_jobs.py,sha256=fDuLfnSJqWDL2oPrxXQMkaqKgplMdKT0SRD_AyB4n-0,43099
 dstack/_internal/server/background/tasks/process_terminating_jobs.py,sha256=S7ZSDVMX-N0XMaMgwFa1QG_RAi48BP432s9AqHw4PMM,4066
 dstack/_internal/server/background/tasks/process_volumes.py,sha256=_fMmkwLYsyX-kpW9pDrZVJvFTZEOPp0gpjyKBMW-zw0,5204
 dstack/_internal/server/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -662,8 +662,8 @@ dstack/plugins/builtin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
 dstack/plugins/builtin/rest_plugin/__init__.py,sha256=lgTsq8Z6Km2F2UhPRChVB4vDM5ZpWtdk1iB1aa20ypA,440
 dstack/plugins/builtin/rest_plugin/_models.py,sha256=9hgVuU6OGSxidar88XhQnNo9izYWeQvVH45ciErv-Es,1910
 dstack/plugins/builtin/rest_plugin/_plugin.py,sha256=h3r3Yc3h22i93fifPTgTm9Oojd1sN1O4DP7ZTV-kWpM,5386
-dstack-0.19.31.dist-info/METADATA,sha256=espPx6ZPYMP95O6EOFv7PmhkMdmjxCTn-yptJZxPER4,21085
-dstack-0.19.31.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-dstack-0.19.31.dist-info/entry_points.txt,sha256=GnLrMS8hx3rWAySQjA7tPNhtixV6a-brRkmal1PKoHc,58
-dstack-0.19.31.dist-info/licenses/LICENSE.md,sha256=qDABaRGjSKVOib1U8viw2P_96sIK7Puo426784oD9f8,15976
-dstack-0.19.31.dist-info/RECORD,,
+dstack-0.19.32.dist-info/METADATA,sha256=R_c6-NfPoaFBeuJVdHTD6dENAKZZsBo_syd6wCdiJ6M,20834
+dstack-0.19.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+dstack-0.19.32.dist-info/entry_points.txt,sha256=GnLrMS8hx3rWAySQjA7tPNhtixV6a-brRkmal1PKoHc,58
+dstack-0.19.32.dist-info/licenses/LICENSE.md,sha256=qDABaRGjSKVOib1U8viw2P_96sIK7Puo426784oD9f8,15976
+dstack-0.19.32.dist-info/RECORD,,