dstack 0.19.7__py3-none-any.whl → 0.19.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic; see the registry's advisory page for more details.

Files changed (60)
  1. dstack/_internal/cli/services/args.py +2 -2
  2. dstack/_internal/cli/services/configurators/run.py +56 -13
  3. dstack/_internal/cli/utils/run.py +10 -5
  4. dstack/_internal/core/backends/aws/compute.py +13 -1
  5. dstack/_internal/core/backends/azure/compute.py +42 -13
  6. dstack/_internal/core/backends/azure/configurator.py +21 -0
  7. dstack/_internal/core/backends/azure/models.py +9 -0
  8. dstack/_internal/core/backends/base/compute.py +101 -27
  9. dstack/_internal/core/backends/base/offers.py +13 -3
  10. dstack/_internal/core/backends/cudo/compute.py +3 -1
  11. dstack/_internal/core/backends/datacrunch/compute.py +2 -0
  12. dstack/_internal/core/backends/gcp/auth.py +1 -1
  13. dstack/_internal/core/backends/gcp/compute.py +51 -35
  14. dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
  15. dstack/_internal/core/backends/local/compute.py +2 -0
  16. dstack/_internal/core/backends/nebius/compute.py +95 -1
  17. dstack/_internal/core/backends/nebius/configurator.py +11 -0
  18. dstack/_internal/core/backends/nebius/fabrics.py +48 -0
  19. dstack/_internal/core/backends/nebius/models.py +9 -1
  20. dstack/_internal/core/backends/nebius/resources.py +29 -0
  21. dstack/_internal/core/backends/oci/compute.py +2 -0
  22. dstack/_internal/core/backends/remote/provisioning.py +27 -2
  23. dstack/_internal/core/backends/template/compute.py.jinja +2 -0
  24. dstack/_internal/core/backends/tensordock/compute.py +2 -0
  25. dstack/_internal/core/backends/vultr/compute.py +5 -1
  26. dstack/_internal/core/models/instances.py +2 -1
  27. dstack/_internal/core/models/resources.py +79 -4
  28. dstack/_internal/core/models/runs.py +26 -9
  29. dstack/_internal/core/models/volumes.py +1 -1
  30. dstack/_internal/server/background/tasks/process_fleets.py +4 -13
  31. dstack/_internal/server/background/tasks/process_instances.py +176 -55
  32. dstack/_internal/server/background/tasks/process_metrics.py +26 -9
  33. dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
  34. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
  35. dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
  36. dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
  37. dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
  38. dstack/_internal/server/models.py +6 -1
  39. dstack/_internal/server/schemas/runner.py +41 -8
  40. dstack/_internal/server/services/fleets.py +9 -26
  41. dstack/_internal/server/services/instances.py +0 -2
  42. dstack/_internal/server/services/jobs/__init__.py +1 -0
  43. dstack/_internal/server/services/offers.py +15 -0
  44. dstack/_internal/server/services/placement.py +27 -6
  45. dstack/_internal/server/services/resources.py +21 -0
  46. dstack/_internal/server/services/runner/client.py +7 -4
  47. dstack/_internal/server/services/runs.py +18 -8
  48. dstack/_internal/server/settings.py +20 -1
  49. dstack/_internal/server/testing/common.py +37 -26
  50. dstack/_internal/utils/common.py +13 -1
  51. dstack/_internal/utils/json_schema.py +6 -3
  52. dstack/api/__init__.py +1 -0
  53. dstack/api/server/_fleets.py +16 -0
  54. dstack/api/server/_runs.py +48 -3
  55. dstack/version.py +1 -1
  56. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/METADATA +38 -29
  57. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/RECORD +60 -56
  58. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/WHEEL +0 -0
  59. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/entry_points.txt +0 -0
  60. {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/licenses/LICENSE.md +0 -0
@@ -6,8 +6,9 @@ from textwrap import dedent
6
6
  from typing import Any, Dict, Generator, List, Optional
7
7
 
8
8
  import paramiko
9
- from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib
9
+ from gpuhunt import AcceleratorVendor, CPUArchitecture, correct_gpu_memory_gib
10
10
 
11
+ from dstack._internal.core.backends.base.compute import GoArchType, normalize_arch
11
12
  from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
12
13
 
13
14
  # FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute
@@ -36,6 +37,22 @@ DSTACK_SHIM_ENV_FILE = "shim.env"
36
37
  HOST_INFO_FILE = "host_info.json"
37
38
 
38
39
 
40
+ def detect_cpu_arch(client: paramiko.SSHClient) -> GoArchType:
41
+ cmd = "uname -m"
42
+ try:
43
+ _, stdout, stderr = client.exec_command(cmd, timeout=20)
44
+ except (paramiko.SSHException, OSError) as e:
45
+ raise ProvisioningError(f"detect_cpu_arch: {e}") from e
46
+ out = stdout.read().strip().decode()
47
+ err = stderr.read().strip().decode()
48
+ if err:
49
+ raise ProvisioningError(f"detect_cpu_arch: {cmd} failed, stdout: {out}, stderr: {err}")
50
+ try:
51
+ return normalize_arch(out)
52
+ except ValueError as e:
53
+ raise ProvisioningError(f"detect_cpu_arch: failed to normalize arch: {e}") from e
54
+
55
+
39
56
  def sftp_upload(client: paramiko.SSHClient, path: str, body: str) -> None:
40
57
  try:
41
58
  sftp = client.open_sftp()
@@ -226,7 +243,14 @@ def get_shim_healthcheck(client: paramiko.SSHClient) -> str:
226
243
  raise ProvisioningError(f"get_shim_healthcheck failed: {e}") from e
227
244
 
228
245
 
229
- def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
246
+ def host_info_to_instance_type(host_info: Dict[str, Any], cpu_arch: GoArchType) -> InstanceType:
247
+ _cpu_arch: CPUArchitecture
248
+ if cpu_arch == "amd64":
249
+ _cpu_arch = CPUArchitecture.X86
250
+ elif cpu_arch == "arm64":
251
+ _cpu_arch = CPUArchitecture.ARM
252
+ else:
253
+ raise ValueError(f"Unexpected cpu_arch: {cpu_arch}")
230
254
  gpu_count = host_info.get("gpu_count", 0)
231
255
  if gpu_count > 0:
232
256
  gpu_vendor = AcceleratorVendor.cast(host_info.get("gpu_vendor", "nvidia"))
@@ -251,6 +275,7 @@ def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
251
275
  instance_type = InstanceType(
252
276
  name="instance",
253
277
  resources=Resources(
278
+ cpu_arch=_cpu_arch,
254
279
  cpus=host_info["cpus"],
255
280
  memory_mib=host_info["memory"] / 1024 / 1024,
256
281
  spot=False,
@@ -18,6 +18,7 @@ from dstack._internal.core.models.instances import (
18
18
  InstanceConfiguration,
19
19
  InstanceOfferWithAvailability,
20
20
  )
21
+ from dstack._internal.core.models.placement import PlacementGroup
21
22
  from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
22
23
  from dstack._internal.core.models.volumes import Volume
23
24
  from dstack._internal.utils.logging import get_logger
@@ -64,6 +65,7 @@ class {{ backend_name }}Compute(
64
65
  self,
65
66
  instance_offer: InstanceOfferWithAvailability,
66
67
  instance_config: InstanceConfiguration,
68
+ placement_group: Optional[PlacementGroup],
67
69
  ) -> JobProvisioningData:
68
70
  # TODO: Implement if backend supports creating instances (VM-based).
69
71
  # Delete if backend can only run jobs (container-based).
@@ -19,6 +19,7 @@ from dstack._internal.core.models.instances import (
19
19
  InstanceConfiguration,
20
20
  InstanceOfferWithAvailability,
21
21
  )
22
+ from dstack._internal.core.models.placement import PlacementGroup
22
23
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements
23
24
  from dstack._internal.utils.logging import get_logger
24
25
 
@@ -57,6 +58,7 @@ class TensorDockCompute(
57
58
  self,
58
59
  instance_offer: InstanceOfferWithAvailability,
59
60
  instance_config: InstanceConfiguration,
61
+ placement_group: Optional[PlacementGroup],
60
62
  ) -> JobProvisioningData:
61
63
  instance_name = generate_unique_instance_name(
62
64
  instance_config, max_length=MAX_INSTANCE_NAME_LEN
@@ -22,6 +22,7 @@ from dstack._internal.core.models.instances import (
22
22
  InstanceOffer,
23
23
  InstanceOfferWithAvailability,
24
24
  )
25
+ from dstack._internal.core.models.placement import PlacementGroup
25
26
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements
26
27
  from dstack._internal.utils.logging import get_logger
27
28
 
@@ -58,7 +59,10 @@ class VultrCompute(
58
59
  return offers
59
60
 
60
61
  def create_instance(
61
- self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
62
+ self,
63
+ instance_offer: InstanceOfferWithAvailability,
64
+ instance_config: InstanceConfiguration,
65
+ placement_group: Optional[PlacementGroup],
62
66
  ) -> JobProvisioningData:
63
67
  instance_name = generate_unique_instance_name(
64
68
  instance_config, max_length=MAX_INSTANCE_NAME_LEN
@@ -49,11 +49,13 @@ class Resources(CoreModel):
49
49
  spot: bool
50
50
  disk: Disk = Disk(size_mib=102400) # the default value (100GB) for backward compatibility
51
51
  description: str = ""
52
+ cpu_arch: Optional[gpuhunt.CPUArchitecture] = None
52
53
 
53
54
  def pretty_format(self, include_spot: bool = False) -> str:
54
55
  resources = {}
55
56
  if self.cpus > 0:
56
57
  resources["cpus"] = self.cpus
58
+ resources["cpu_arch"] = self.cpu_arch
57
59
  if self.memory_mib > 0:
58
60
  resources["memory"] = f"{self.memory_mib / 1024:.0f}GB"
59
61
  if self.disk.size_mib > 0:
@@ -105,7 +107,6 @@ class InstanceConfiguration(CoreModel):
105
107
  user: str # dstack user name
106
108
  ssh_keys: List[SSHKey]
107
109
  instance_id: Optional[str] = None
108
- placement_group_name: Optional[str] = None
109
110
  reservation: Optional[str] = None
110
111
  volumes: Optional[List[Volume]] = None
111
112
  tags: Optional[Dict[str, str]] = None
@@ -1,8 +1,9 @@
1
1
  import math
2
+ from collections.abc import Mapping
2
3
  from typing import Any, Dict, Generic, List, Optional, Tuple, TypeVar, Union
3
4
 
4
5
  import gpuhunt
5
- from pydantic import Field, root_validator, validator
6
+ from pydantic import Field, parse_obj_as, root_validator, validator
6
7
  from pydantic.generics import GenericModel
7
8
  from typing_extensions import Annotated
8
9
 
@@ -125,7 +126,68 @@ class ComputeCapability(Tuple[int, int]):
125
126
 
126
127
  DEFAULT_CPU_COUNT = Range[int](min=2)
127
128
  DEFAULT_MEMORY_SIZE = Range[Memory](min=Memory.parse("8GB"))
128
- DEFAULT_GPU_COUNT = Range[int](min=1, max=1)
129
+ DEFAULT_GPU_COUNT = Range[int](min=1)
130
+
131
+
132
+ class CPUSpec(CoreModel):
133
+ class Config:
134
+ @staticmethod
135
+ def schema_extra(schema: Dict[str, Any]):
136
+ add_extra_schema_types(
137
+ schema["properties"]["count"],
138
+ extra_types=[{"type": "integer"}, {"type": "string"}],
139
+ )
140
+
141
+ arch: Annotated[
142
+ Optional[gpuhunt.CPUArchitecture],
143
+ Field(description="The CPU architecture, one of: `x86`, `arm`"),
144
+ ] = None
145
+ count: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
146
+
147
+ @classmethod
148
+ def __get_validators__(cls):
149
+ yield cls.parse
150
+ yield cls.validate
151
+
152
+ @classmethod
153
+ def parse(cls, v: Any) -> Any:
154
+ if isinstance(v, int):
155
+ v = str(v)
156
+ if isinstance(v, str):
157
+ tokens = v.replace(" ", "").split(":")
158
+ spec = {}
159
+ for token in tokens:
160
+ if not token:
161
+ raise ValueError(f"CPU spec contains empty token: {v}")
162
+ if ".." in token or token.isdigit():
163
+ if "count" in spec:
164
+ raise ValueError(f"CPU spec count conflict: {v}")
165
+ spec["count"] = token
166
+ else:
167
+ try:
168
+ arch = gpuhunt.CPUArchitecture.cast(token)
169
+ except ValueError:
170
+ raise ValueError(f"Invalid CPU architecture: {v}")
171
+ if "arch" in spec:
172
+ raise ValueError(f"CPU spec arch conflict: {v}")
173
+ spec["arch"] = arch
174
+ return spec
175
+ # Range and min/max dict - for backward compatibility
176
+ if isinstance(v, Range):
177
+ return {"arch": None, "count": v}
178
+ if isinstance(v, Mapping) and v.keys() == {"min", "max"}:
179
+ return {"arch": None, "count": v}
180
+ return v
181
+
182
+ @validator("arch", pre=True)
183
+ def _validate_arch(cls, v: Any) -> Any:
184
+ if v is None:
185
+ return None
186
+ if isinstance(v, gpuhunt.CPUArchitecture):
187
+ return v
188
+ if isinstance(v, str):
189
+ return gpuhunt.CPUArchitecture.cast(v)
190
+ return v
129
191
 
130
192
 
131
193
  class GPUSpec(CoreModel):
@@ -302,7 +364,10 @@ class ResourcesSpec(CoreModel):
302
364
  extra_types=[{"type": "integer"}, {"type": "string"}],
303
365
  )
304
366
 
305
- cpu: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
367
+ # TODO: Remove Range[int] in 0.20. Range[int] for backward compatibility only.
368
+ cpu: Annotated[Union[CPUSpec, Range[int]], Field(description="The CPU requirements")] = (
369
+ CPUSpec()
370
+ )
306
371
  memory: Annotated[Range[Memory], Field(description="The RAM size (e.g., `8GB`)")] = (
307
372
  DEFAULT_MEMORY_SIZE
308
373
  )
@@ -317,8 +382,18 @@ class ResourcesSpec(CoreModel):
317
382
  gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None
318
383
  disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK
319
384
 
385
+ # TODO: Remove in 0.20. Added for backward compatibility.
386
+ @root_validator
387
+ def _post_validate(cls, values):
388
+ cpu = values.get("cpu")
389
+ if isinstance(cpu, CPUSpec) and cpu.arch in [None, gpuhunt.CPUArchitecture.X86]:
390
+ values["cpu"] = cpu.count
391
+ return values
392
+
320
393
  def pretty_format(self) -> str:
321
- resources: Dict[str, Any] = dict(cpus=self.cpu, memory=self.memory)
394
+ # TODO: Remove in 0.20. Use self.cpu directly
395
+ cpu = parse_obj_as(CPUSpec, self.cpu)
396
+ resources: Dict[str, Any] = dict(cpu_arch=cpu.arch, cpus=cpu.count, memory=self.memory)
322
397
  if self.gpu:
323
398
  gpu = self.gpu
324
399
  resources.update(
@@ -104,6 +104,7 @@ class JobTerminationReason(str, Enum):
104
104
  # Set by the server
105
105
  FAILED_TO_START_DUE_TO_NO_CAPACITY = "failed_to_start_due_to_no_capacity"
106
106
  INTERRUPTED_BY_NO_CAPACITY = "interrupted_by_no_capacity"
107
+ INSTANCE_UNREACHABLE = "instance_unreachable"
107
108
  WAITING_INSTANCE_LIMIT_EXCEEDED = "waiting_instance_limit_exceeded"
108
109
  WAITING_RUNNER_LIMIT_EXCEEDED = "waiting_runner_limit_exceeded"
109
110
  TERMINATED_BY_USER = "terminated_by_user"
@@ -126,6 +127,7 @@ class JobTerminationReason(str, Enum):
126
127
  mapping = {
127
128
  self.FAILED_TO_START_DUE_TO_NO_CAPACITY: JobStatus.FAILED,
128
129
  self.INTERRUPTED_BY_NO_CAPACITY: JobStatus.FAILED,
130
+ self.INSTANCE_UNREACHABLE: JobStatus.FAILED,
129
131
  self.WAITING_INSTANCE_LIMIT_EXCEEDED: JobStatus.FAILED,
130
132
  self.WAITING_RUNNER_LIMIT_EXCEEDED: JobStatus.FAILED,
131
133
  self.TERMINATED_BY_USER: JobStatus.TERMINATED,
@@ -262,9 +264,9 @@ class JobRuntimeData(CoreModel):
262
264
  # or not applicable (container-based backends)
263
265
  ports: Optional[dict[int, int]] = None
264
266
  # List of volumes used by the job
265
- volume_names: Optional[list[str]] = None # None for backward compalibility
267
+ volume_names: Optional[list[str]] = None # None for backward compatibility
266
268
  # Virtual shared offer
267
- offer: Optional[InstanceOfferWithAvailability] = None # None for backward compalibility
269
+ offer: Optional[InstanceOfferWithAvailability] = None # None for backward compatibility
268
270
 
269
271
 
270
272
  class ClusterInfo(CoreModel):
@@ -283,6 +285,7 @@ class JobSubmission(CoreModel):
283
285
  status: JobStatus
284
286
  termination_reason: Optional[JobTerminationReason]
285
287
  termination_reason_message: Optional[str]
288
+ exit_status: Optional[int]
286
289
  job_provisioning_data: Optional[JobProvisioningData]
287
290
  job_runtime_data: Optional[JobRuntimeData]
288
291
 
@@ -439,9 +442,14 @@ class Run(CoreModel):
439
442
 
440
443
  @root_validator
441
444
  def _error(cls, values) -> Dict:
445
+ try:
446
+ termination_reason = values["termination_reason"]
447
+ jobs = values["jobs"]
448
+ except KeyError:
449
+ return values
442
450
  values["error"] = _get_run_error(
443
- run_termination_reason=values["termination_reason"],
444
- run_jobs=values["jobs"],
451
+ run_termination_reason=termination_reason,
452
+ run_jobs=jobs,
445
453
  )
446
454
  return values
447
455
 
@@ -503,7 +511,9 @@ def _get_run_error(
503
511
  return ""
504
512
  if len(run_jobs) > 1:
505
513
  return run_termination_reason.name
506
- run_job_termination_reason = _get_run_job_termination_reason(run_jobs)
514
+ run_job_termination_reason, exit_status = _get_run_job_termination_reason_and_exit_status(
515
+ run_jobs
516
+ )
507
517
  # For failed runs, also show termination reason to provide more context.
508
518
  # For other run statuses, the job termination reason will duplicate run status.
509
519
  if run_job_termination_reason is not None and run_termination_reason in [
@@ -511,13 +521,20 @@ def _get_run_error(
511
521
  RunTerminationReason.SERVER_ERROR,
512
522
  RunTerminationReason.RETRY_LIMIT_EXCEEDED,
513
523
  ]:
524
+ if exit_status:
525
+ return (
526
+ f"{run_termination_reason.name}\n({run_job_termination_reason.name} {exit_status})"
527
+ )
514
528
  return f"{run_termination_reason.name}\n({run_job_termination_reason.name})"
515
529
  return run_termination_reason.name
516
530
 
517
531
 
518
- def _get_run_job_termination_reason(run_jobs: List[Job]) -> Optional[JobTerminationReason]:
532
+ def _get_run_job_termination_reason_and_exit_status(
533
+ run_jobs: List[Job],
534
+ ) -> tuple[Optional[JobTerminationReason], Optional[int]]:
519
535
  for job in run_jobs:
520
536
  if len(job.job_submissions) > 0:
521
- if job.job_submissions[-1].termination_reason is not None:
522
- return job.job_submissions[-1].termination_reason
523
- return None
537
+ job_submission = job.job_submissions[-1]
538
+ if job_submission.termination_reason is not None:
539
+ return job_submission.termination_reason, job_submission.exit_status
540
+ return None, None
@@ -159,7 +159,7 @@ class VolumeMountPoint(CoreModel):
159
159
  description=(
160
160
  "The network volume name or the list of network volume names to mount."
161
161
  " If a list is specified, one of the volumes in the list will be mounted."
162
- " Specify volumes from different backends/regions to increase availability."
162
+ " Specify volumes from different backends/regions to increase availability"
163
163
  )
164
164
  ),
165
165
  ]
@@ -1,15 +1,16 @@
1
- from sqlalchemy import select, update
1
+ from sqlalchemy import select
2
2
  from sqlalchemy.ext.asyncio import AsyncSession
3
3
  from sqlalchemy.orm import joinedload
4
4
 
5
5
  from dstack._internal.core.models.fleets import FleetStatus
6
6
  from dstack._internal.server.db import get_session_ctx
7
- from dstack._internal.server.models import FleetModel, PlacementGroupModel
7
+ from dstack._internal.server.models import FleetModel
8
8
  from dstack._internal.server.services.fleets import (
9
9
  is_fleet_empty,
10
10
  is_fleet_in_use,
11
11
  )
12
12
  from dstack._internal.server.services.locking import get_locker
13
+ from dstack._internal.server.services.placement import schedule_fleet_placement_groups_deletion
13
14
  from dstack._internal.utils.common import get_current_datetime
14
15
  from dstack._internal.utils.logging import get_logger
15
16
 
@@ -68,16 +69,6 @@ async def _autodelete_fleet(session: AsyncSession, fleet_model: FleetModel):
68
69
  fleet_model.status = FleetStatus.TERMINATED
69
70
  fleet_model.deleted = True
70
71
  fleet_model.last_processed_at = get_current_datetime()
71
- await _mark_placement_groups_as_ready_for_deletion(session=session, fleet_model=fleet_model)
72
+ await schedule_fleet_placement_groups_deletion(session=session, fleet_id=fleet_model.id)
72
73
  await session.commit()
73
74
  logger.info("Fleet %s deleted", fleet_model.name)
74
-
75
-
76
- async def _mark_placement_groups_as_ready_for_deletion(
77
- session: AsyncSession, fleet_model: FleetModel
78
- ):
79
- await session.execute(
80
- update(PlacementGroupModel)
81
- .where(PlacementGroupModel.fleet_id == fleet_model.id)
82
- .values(fleet_deleted=True)
83
- )