dstack 0.19.20__py3-none-any.whl → 0.19.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (93) hide show
  1. dstack/_internal/cli/commands/apply.py +8 -3
  2. dstack/_internal/cli/services/configurators/__init__.py +8 -0
  3. dstack/_internal/cli/services/configurators/fleet.py +1 -1
  4. dstack/_internal/cli/services/configurators/gateway.py +1 -1
  5. dstack/_internal/cli/services/configurators/run.py +11 -1
  6. dstack/_internal/cli/services/configurators/volume.py +1 -1
  7. dstack/_internal/cli/utils/common.py +48 -5
  8. dstack/_internal/cli/utils/fleet.py +5 -5
  9. dstack/_internal/cli/utils/run.py +32 -0
  10. dstack/_internal/core/backends/__init__.py +0 -65
  11. dstack/_internal/core/backends/configurators.py +9 -0
  12. dstack/_internal/core/backends/features.py +64 -0
  13. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  14. dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
  15. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  16. dstack/_internal/core/backends/hotaisle/compute.py +225 -0
  17. dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
  18. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  19. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  20. dstack/_internal/core/backends/models.py +8 -0
  21. dstack/_internal/core/compatibility/fleets.py +2 -0
  22. dstack/_internal/core/compatibility/runs.py +12 -0
  23. dstack/_internal/core/models/backends/base.py +2 -0
  24. dstack/_internal/core/models/configurations.py +139 -1
  25. dstack/_internal/core/models/health.py +28 -0
  26. dstack/_internal/core/models/instances.py +2 -0
  27. dstack/_internal/core/models/logs.py +2 -1
  28. dstack/_internal/core/models/profiles.py +37 -0
  29. dstack/_internal/core/models/runs.py +21 -1
  30. dstack/_internal/core/services/ssh/tunnel.py +7 -0
  31. dstack/_internal/server/app.py +26 -10
  32. dstack/_internal/server/background/__init__.py +9 -6
  33. dstack/_internal/server/background/tasks/process_fleets.py +52 -38
  34. dstack/_internal/server/background/tasks/process_gateways.py +2 -2
  35. dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
  36. dstack/_internal/server/background/tasks/process_instances.py +168 -103
  37. dstack/_internal/server/background/tasks/process_metrics.py +9 -2
  38. dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
  39. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  40. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
  41. dstack/_internal/server/background/tasks/process_running_jobs.py +142 -124
  42. dstack/_internal/server/background/tasks/process_runs.py +84 -34
  43. dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
  44. dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
  45. dstack/_internal/server/background/tasks/process_volumes.py +4 -1
  46. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  47. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  48. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  49. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  50. dstack/_internal/server/models.py +57 -16
  51. dstack/_internal/server/routers/instances.py +33 -5
  52. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  53. dstack/_internal/server/schemas/instances.py +32 -0
  54. dstack/_internal/server/schemas/runner.py +5 -0
  55. dstack/_internal/server/services/fleets.py +19 -10
  56. dstack/_internal/server/services/gateways/__init__.py +17 -17
  57. dstack/_internal/server/services/instances.py +113 -15
  58. dstack/_internal/server/services/jobs/__init__.py +18 -13
  59. dstack/_internal/server/services/jobs/configurators/base.py +26 -0
  60. dstack/_internal/server/services/logging.py +4 -2
  61. dstack/_internal/server/services/logs/aws.py +13 -1
  62. dstack/_internal/server/services/logs/gcp.py +16 -1
  63. dstack/_internal/server/services/offers.py +3 -3
  64. dstack/_internal/server/services/probes.py +6 -0
  65. dstack/_internal/server/services/projects.py +51 -19
  66. dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
  67. dstack/_internal/server/services/prometheus/custom_metrics.py +2 -3
  68. dstack/_internal/server/services/runner/client.py +52 -20
  69. dstack/_internal/server/services/runner/ssh.py +4 -4
  70. dstack/_internal/server/services/runs.py +115 -39
  71. dstack/_internal/server/services/services/__init__.py +4 -1
  72. dstack/_internal/server/services/ssh.py +66 -0
  73. dstack/_internal/server/services/users.py +2 -3
  74. dstack/_internal/server/services/volumes.py +11 -11
  75. dstack/_internal/server/settings.py +16 -0
  76. dstack/_internal/server/statics/index.html +1 -1
  77. dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
  78. dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
  79. dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
  80. dstack/_internal/server/testing/common.py +51 -0
  81. dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
  82. dstack/_internal/server/utils/sentry_utils.py +12 -0
  83. dstack/_internal/settings.py +3 -0
  84. dstack/_internal/utils/common.py +15 -0
  85. dstack/_internal/utils/cron.py +5 -0
  86. dstack/api/server/__init__.py +1 -1
  87. dstack/version.py +1 -1
  88. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/METADATA +13 -22
  89. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/RECORD +93 -75
  90. /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
  91. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/WHEEL +0 -0
  92. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/entry_points.txt +0 -0
  93. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,225 @@
1
+ import shlex
2
+ import subprocess
3
+ import tempfile
4
+ from threading import Thread
5
+ from typing import List, Optional
6
+
7
+ import gpuhunt
8
+ from gpuhunt.providers.hotaisle import HotAisleProvider
9
+
10
+ from dstack._internal.core.backends.base.compute import (
11
+ Compute,
12
+ ComputeWithCreateInstanceSupport,
13
+ get_shim_commands,
14
+ )
15
+ from dstack._internal.core.backends.base.offers import get_catalog_offers
16
+ from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient
17
+ from dstack._internal.core.backends.hotaisle.models import HotAisleConfig
18
+ from dstack._internal.core.models.backends.base import BackendType
19
+ from dstack._internal.core.models.common import CoreModel
20
+ from dstack._internal.core.models.instances import (
21
+ InstanceAvailability,
22
+ InstanceConfiguration,
23
+ InstanceOfferWithAvailability,
24
+ )
25
+ from dstack._internal.core.models.placement import PlacementGroup
26
+ from dstack._internal.core.models.runs import JobProvisioningData, Requirements
27
+ from dstack._internal.utils.logging import get_logger
28
+
29
logger = get_logger(__name__)

# NOTE(review): this constant is not referenced anywhere in this module —
# confirm it is used by callers before removing.
MAX_INSTANCE_NAME_LEN = 60


# CPU details per supported Hot Aisle instance type. The gpuhunt offer does not
# carry CPU model/frequency, so they are hard-coded here; offers whose instance
# name is not in this table are skipped in get_offers().
INSTANCE_TYPE_SPECS = {
    "1x MI300X 8x Xeon Platinum 8462Y+": {
        "cpu_model": "Xeon Platinum 8462Y+",
        "cpu_frequency": 2800000000,  # presumably Hz (2.8 GHz) — confirm API units
        "cpu_manufacturer": "Intel",
    },
    "1x MI300X 13x Xeon Platinum 8470": {
        "cpu_model": "Xeon Platinum 8470",
        "cpu_frequency": 2000000000,  # presumably Hz (2.0 GHz) — confirm API units
        "cpu_manufacturer": "Intel",
    },
}
46
+
47
+
48
class HotAisleCompute(
    ComputeWithCreateInstanceSupport,
    Compute,
):
    """Compute implementation for the Hot Aisle backend.

    Creates bare VMs through the Hot Aisle API and then bootstraps dstack-shim
    on them over SSH (no user-data/cloud-init mechanism is visible here).
    """

    def __init__(self, config: HotAisleConfig):
        super().__init__()
        self.config = config
        self.api_client = HotAisleAPIClient(config.creds.api_key, config.team_handle)
        # Offers come from a dedicated gpuhunt catalog backed by the Hot Aisle
        # provider; automatic reloading and resource balancing are disabled.
        self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False)
        self.catalog.add_provider(
            HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle)
        )

    def get_offers(
        self, requirements: Optional[Requirements] = None
    ) -> List[InstanceOfferWithAvailability]:
        """Return catalog offers restricted to instance types in INSTANCE_TYPE_SPECS."""
        offers = get_catalog_offers(
            backend=BackendType.HOTAISLE,
            locations=self.config.regions or None,
            requirements=requirements,
            catalog=self.catalog,
        )

        supported_offers = []
        for offer in offers:
            if offer.instance.name in INSTANCE_TYPE_SPECS:
                # Every supported offer is reported AVAILABLE; actual capacity
                # is only discovered at create time.
                supported_offers.append(
                    InstanceOfferWithAvailability(
                        **offer.dict(), availability=InstanceAvailability.AVAILABLE
                    )
                )
            else:
                logger.warning(
                    f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}"
                )

        return supported_offers

    def get_payload_from_offer(self, instance_type) -> dict:
        """Build the VM-creation request body from an offer's instance type.

        NOTE(review): assumes the instance type has at least one GPU
        (resources.gpus[0]) — true for all entries in INSTANCE_TYPE_SPECS.
        """
        instance_type_name = instance_type.name
        cpu_specs = INSTANCE_TYPE_SPECS[instance_type_name]
        cpu_cores = instance_type.resources.cpus

        return {
            "cpu_cores": cpu_cores,
            "cpus": {
                "count": 1,
                "manufacturer": cpu_specs["cpu_manufacturer"],
                "model": cpu_specs["cpu_model"],
                "cores": cpu_cores,
                "frequency": cpu_specs["cpu_frequency"],
            },
            # MiB -> bytes
            "disk_capacity": instance_type.resources.disk.size_mib * 1024**2,
            "ram_capacity": instance_type.resources.memory_mib * 1024**2,
            "gpus": [
                {
                    "count": len(instance_type.resources.gpus),
                    "manufacturer": instance_type.resources.gpus[0].vendor,
                    "model": instance_type.resources.gpus[0].name,
                }
            ],
        }

    def create_instance(
        self,
        instance_offer: InstanceOfferWithAvailability,
        instance_config: InstanceConfiguration,
        placement_group: Optional[PlacementGroup],
    ) -> JobProvisioningData:
        """Create a Hot Aisle VM and return its provisioning data.

        hostname is left None here and filled in by update_provisioning_data
        once the VM reports the `running` state.
        """
        project_ssh_key = instance_config.ssh_keys[0]
        self.api_client.upload_ssh_key(project_ssh_key.public)
        vm_payload = self.get_payload_from_offer(instance_offer.instance)
        vm_data = self.api_client.create_virtual_machine(vm_payload)
        return JobProvisioningData(
            backend=instance_offer.backend,
            instance_type=instance_offer.instance,
            instance_id=vm_data["name"],
            hostname=None,
            internal_ip=None,
            region=instance_offer.region,
            price=instance_offer.price,
            username="hotaisle",
            ssh_port=22,
            dockerized=True,
            ssh_proxy=None,
            backend_data=HotAisleInstanceBackendData(
                ip_address=vm_data["ip_address"], vm_id=vm_data["name"]
            ).json(),
        )

    def update_provisioning_data(
        self,
        provisioning_data: JobProvisioningData,
        project_ssh_public_key: str,
        project_ssh_private_key: str,
    ):
        """Poll the VM state; once running, set the hostname and launch dstack-shim.

        The shim launch runs in a daemon thread so this (periodically invoked)
        method does not block on SSH.
        NOTE(review): the thread is started on every call while the VM is
        `running` — confirm callers stop polling once provisioning completes,
        otherwise the shim launch command is re-sent repeatedly.
        """
        vm_state = self.api_client.get_vm_state(provisioning_data.instance_id)
        if vm_state == "running":
            if provisioning_data.hostname is None and provisioning_data.backend_data:
                backend_data = HotAisleInstanceBackendData.load(provisioning_data.backend_data)
                provisioning_data.hostname = backend_data.ip_address
            commands = get_shim_commands(
                authorized_keys=[project_ssh_public_key],
                arch=provisioning_data.instance_type.resources.cpu_arch,
            )
            # Join the shim bootstrap commands into a single quoted shell
            # command so it can be executed via `sudo sh -c`.
            launch_command = "sudo sh -c " + shlex.quote(" && ".join(commands))
            thread = Thread(
                target=_start_runner,
                kwargs={
                    "hostname": provisioning_data.hostname,
                    "project_ssh_private_key": project_ssh_private_key,
                    "launch_command": launch_command,
                },
                daemon=True,
            )
            thread.start()

    def terminate_instance(
        self, instance_id: str, region: str, backend_data: Optional[str] = None
    ):
        """Terminate the VM; instance_id is the Hot Aisle VM name."""
        vm_name = instance_id
        self.api_client.terminate_virtual_machine(vm_name)
170
+
171
+
172
def _start_runner(
    hostname: str,
    project_ssh_private_key: str,
    launch_command: str,
):
    """Thread target that bootstraps dstack-shim on the VM over SSH."""
    _launch_runner(hostname, project_ssh_private_key, launch_command)
182
+
183
+
184
def _launch_runner(
    hostname: str,
    ssh_private_key: str,
    launch_command: str,
):
    """Run the shim launch command on the VM, detached from the SSH session."""
    # Redirect output to a log file and `disown` so the shim survives the
    # SSH session; any trailing `&` on the incoming command is stripped first.
    detached_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown"
    _run_ssh_command(
        hostname=hostname,
        ssh_private_key=ssh_private_key,
        command=detached_command,
    )
195
+
196
+
197
def _run_ssh_command(hostname: str, ssh_private_key: str, command: str):
    """Run `command` on the Hot Aisle VM over SSH using a throwaway key file.

    Output is discarded; the return code is not checked (best-effort launch).
    """
    # Bug fix: NamedTemporaryFile's second positional argument is `buffering`,
    # not a permission mode — the original passed 0o600 (=384) there by
    # mistake. No explicit mode is needed: on POSIX the temp file is already
    # created with 0600 permissions, which is what ssh -i requires.
    with tempfile.NamedTemporaryFile("w+") as f:
        f.write(ssh_private_key)
        f.flush()
        subprocess.run(
            [
                "ssh",
                "-F",
                "none",
                "-o",
                "StrictHostKeyChecking=no",
                "-i",
                f.name,
                f"hotaisle@{hostname}",
                command,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
216
+
217
+
218
class HotAisleInstanceBackendData(CoreModel):
    # Backend-specific data persisted in JobProvisioningData.backend_data.
    ip_address: str
    vm_id: Optional[str] = None

    @classmethod
    def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData":
        """Deserialize from the JSON stored in backend_data.

        Raises:
            ValueError: if `raw` is None. (Was an `assert`, which is stripped
            under `python -O`; an explicit exception is always enforced.)
        """
        if raw is None:
            raise ValueError("backend_data is None; expected serialized instance data")
        return cls.__response__.parse_raw(raw)
@@ -0,0 +1,60 @@
1
+ import json
2
+
3
+ from dstack._internal.core.backends.base.configurator import (
4
+ BackendRecord,
5
+ Configurator,
6
+ )
7
+ from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient
8
+ from dstack._internal.core.backends.hotaisle.backend import HotAisleBackend
9
+ from dstack._internal.core.backends.hotaisle.models import (
10
+ AnyHotAisleBackendConfig,
11
+ AnyHotAisleCreds,
12
+ HotAisleBackendConfig,
13
+ HotAisleBackendConfigWithCreds,
14
+ HotAisleConfig,
15
+ HotAisleCreds,
16
+ HotAisleStoredConfig,
17
+ )
18
+ from dstack._internal.core.models.backends.base import (
19
+ BackendType,
20
+ )
21
+
22
+
23
class HotAisleConfigurator(Configurator):
    """Configurator for the Hot Aisle backend.

    Validates credentials and converts between API config models and the
    stored BackendRecord (config and creds are persisted separately).
    """

    TYPE = BackendType.HOTAISLE
    BACKEND_CLASS = HotAisleBackend

    def validate_config(self, config: HotAisleBackendConfigWithCreds, default_creds_enabled: bool):
        # Hot Aisle has no default-credentials flow; default_creds_enabled is ignored.
        self._validate_creds(config.creds, config.team_handle)

    def create_backend(
        self, project_name: str, config: HotAisleBackendConfigWithCreds
    ) -> BackendRecord:
        """Split the incoming config into a creds-free config blob and an auth blob."""
        return BackendRecord(
            config=HotAisleStoredConfig(
                **HotAisleBackendConfig.__response__.parse_obj(config).dict()
            ).json(),
            auth=HotAisleCreds.parse_obj(config.creds).json(),
        )

    def get_backend_config(
        self, record: BackendRecord, include_creds: bool
    ) -> AnyHotAisleBackendConfig:
        """Return the stored config, with credentials only when requested."""
        config = self._get_config(record)
        if include_creds:
            return HotAisleBackendConfigWithCreds.__response__.parse_obj(config)
        return HotAisleBackendConfig.__response__.parse_obj(config)

    def get_backend(self, record: BackendRecord) -> HotAisleBackend:
        config = self._get_config(record)
        return HotAisleBackend(config=config)

    def _get_config(self, record: BackendRecord) -> HotAisleConfig:
        # Recombine the stored JSON config with the separately stored creds.
        return HotAisleConfig.__response__(
            **json.loads(record.config),
            creds=HotAisleCreds.parse_raw(record.auth),
        )

    def _validate_creds(self, creds: AnyHotAisleCreds, team_handle: str):
        # Live API call; presumably raises on an invalid key — see
        # HotAisleAPIClient.validate_api_key.
        api_client = HotAisleAPIClient(creds.api_key, team_handle)
        api_client.validate_api_key()
@@ -0,0 +1,45 @@
1
+ from typing import Annotated, List, Literal, Optional, Union
2
+
3
+ from pydantic import Field
4
+
5
+ from dstack._internal.core.models.common import CoreModel
6
+
7
+
8
class HotAisleAPIKeyCreds(CoreModel):
    # API-key credentials — currently the only supported auth scheme.
    type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key"
    api_key: Annotated[str, Field(description="The Hot Aisle API key")]


# Single-variant aliases, kept for symmetry with backends that support
# several credential types.
AnyHotAisleCreds = HotAisleAPIKeyCreds
HotAisleCreds = AnyHotAisleCreds


class HotAisleBackendConfig(CoreModel):
    # Backend configuration without credentials.
    type: Annotated[
        Literal["hotaisle"],
        Field(description="The type of backend"),
    ] = "hotaisle"
    team_handle: Annotated[str, Field(description="The Hot Aisle team handle")]
    regions: Annotated[
        Optional[List[str]],
        Field(description="The list of Hot Aisle regions. Omit to use all regions"),
    ] = None


class HotAisleBackendConfigWithCreds(HotAisleBackendConfig):
    # Configuration plus credentials, as accepted/returned by the API.
    creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")]


AnyHotAisleBackendConfig = Union[HotAisleBackendConfig, HotAisleBackendConfigWithCreds]


class HotAisleBackendFileConfigWithCreds(HotAisleBackendConfig):
    # Variant used when the config comes from the server config file.
    creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")]


class HotAisleStoredConfig(HotAisleBackendConfig):
    # Shape persisted in the backend record (creds are stored separately).
    pass


class HotAisleConfig(HotAisleStoredConfig):
    # Full runtime config: stored config recombined with creds.
    creds: AnyHotAisleCreds
@@ -206,10 +206,11 @@ def _launch_runner(
206
206
  ssh_private_key: str,
207
207
  launch_command: str,
208
208
  ):
209
+ daemonized_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown"
209
210
  _run_ssh_command(
210
211
  hostname=hostname,
211
212
  ssh_private_key=ssh_private_key,
212
- command=launch_command,
213
+ command=daemonized_command,
213
214
  )
214
215
 
215
216
 
@@ -29,6 +29,11 @@ from dstack._internal.core.backends.gcp.models import (
29
29
  GCPBackendConfigWithCreds,
30
30
  GCPBackendFileConfigWithCreds,
31
31
  )
32
+ from dstack._internal.core.backends.hotaisle.models import (
33
+ HotAisleBackendConfig,
34
+ HotAisleBackendConfigWithCreds,
35
+ HotAisleBackendFileConfigWithCreds,
36
+ )
32
37
  from dstack._internal.core.backends.kubernetes.models import (
33
38
  KubernetesBackendConfig,
34
39
  KubernetesBackendConfigWithCreds,
@@ -73,6 +78,7 @@ AnyBackendConfigWithoutCreds = Union[
73
78
  CudoBackendConfig,
74
79
  DataCrunchBackendConfig,
75
80
  GCPBackendConfig,
81
+ HotAisleBackendConfig,
76
82
  KubernetesBackendConfig,
77
83
  LambdaBackendConfig,
78
84
  NebiusBackendConfig,
@@ -95,6 +101,7 @@ AnyBackendConfigWithCreds = Union[
95
101
  CudoBackendConfigWithCreds,
96
102
  DataCrunchBackendConfigWithCreds,
97
103
  GCPBackendConfigWithCreds,
104
+ HotAisleBackendConfigWithCreds,
98
105
  KubernetesBackendConfigWithCreds,
99
106
  LambdaBackendConfigWithCreds,
100
107
  OCIBackendConfigWithCreds,
@@ -116,6 +123,7 @@ AnyBackendFileConfigWithCreds = Union[
116
123
  CudoBackendConfigWithCreds,
117
124
  DataCrunchBackendConfigWithCreds,
118
125
  GCPBackendFileConfigWithCreds,
126
+ HotAisleBackendFileConfigWithCreds,
119
127
  KubernetesBackendFileConfigWithCreds,
120
128
  LambdaBackendConfigWithCreds,
121
129
  OCIBackendConfigWithCreds,
@@ -57,6 +57,8 @@ def get_fleet_spec_excludes(fleet_spec: FleetSpec) -> Optional[IncludeExcludeDic
57
57
  profile_excludes.add("startup_order")
58
58
  if profile.stop_criteria is None:
59
59
  profile_excludes.add("stop_criteria")
60
+ if profile.schedule is None:
61
+ profile_excludes.add("schedule")
60
62
  if configuration_excludes:
61
63
  spec_excludes["configuration"] = configuration_excludes
62
64
  if profile_excludes:
@@ -53,6 +53,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
53
53
  job_submissions_excludes["exit_status"] = True
54
54
  if all(js.deployment_num == 0 for js in job_submissions):
55
55
  job_submissions_excludes["deployment_num"] = True
56
+ if all(not js.probes for js in job_submissions):
57
+ job_submissions_excludes["probes"] = True
56
58
  latest_job_submission = current_resource.latest_job_submission
57
59
  if latest_job_submission is not None:
58
60
  latest_job_submission_excludes: IncludeExcludeDictType = {}
@@ -69,6 +71,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
69
71
  latest_job_submission_excludes["exit_status"] = True
70
72
  if latest_job_submission.deployment_num == 0:
71
73
  latest_job_submission_excludes["deployment_num"] = True
74
+ if not latest_job_submission.probes:
75
+ latest_job_submission_excludes["probes"] = True
72
76
  return {"plan": apply_plan_excludes}
73
77
 
74
78
 
@@ -120,12 +124,18 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
120
124
  profile_excludes.add("startup_order")
121
125
  if configuration.stop_criteria is None:
122
126
  configuration_excludes["stop_criteria"] = True
127
+ if isinstance(configuration, ServiceConfiguration) and not configuration.probes:
128
+ configuration_excludes["probes"] = True
123
129
  if profile is not None and profile.stop_criteria is None:
124
130
  profile_excludes.add("stop_criteria")
125
131
  if not configuration.files:
126
132
  configuration_excludes["files"] = True
127
133
  if not run_spec.file_archives:
128
134
  spec_excludes["file_archives"] = True
135
+ if configuration.schedule is None:
136
+ configuration_excludes["schedule"] = True
137
+ if profile is not None and profile.schedule is None:
138
+ profile_excludes.add("schedule")
129
139
 
130
140
  if configuration_excludes:
131
141
  spec_excludes["configuration"] = configuration_excludes
@@ -150,6 +160,8 @@ def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType:
150
160
  spec_excludes["file_archives"] = True
151
161
  if all(s.service_port is None for s in job_specs):
152
162
  spec_excludes["service_port"] = True
163
+ if all(not s.probes for s in job_specs):
164
+ spec_excludes["probes"] = True
153
165
 
154
166
  return spec_excludes
155
167
 
@@ -11,6 +11,7 @@ class BackendType(str, enum.Enum):
11
11
  DSTACK (BackendType): dstack Sky
12
12
  GCP (BackendType): Google Cloud Platform
13
13
  DATACRUNCH (BackendType): DataCrunch
14
+ HOTAISLE (BackendType): Hot Aisle
14
15
  KUBERNETES (BackendType): Kubernetes
15
16
  LAMBDA (BackendType): Lambda Cloud
16
17
  NEBIUS (BackendType): Nebius AI Cloud
@@ -28,6 +29,7 @@ class BackendType(str, enum.Enum):
28
29
  DATACRUNCH = "datacrunch"
29
30
  DSTACK = "dstack"
30
31
  GCP = "gcp"
32
+ HOTAISLE = "hotaisle"
31
33
  KUBERNETES = "kubernetes"
32
34
  LAMBDA = "lambda"
33
35
  LOCAL = "local"
@@ -14,11 +14,12 @@ from dstack._internal.core.models.envs import Env
14
14
  from dstack._internal.core.models.files import FilePathMapping
15
15
  from dstack._internal.core.models.fleets import FleetConfiguration
16
16
  from dstack._internal.core.models.gateways import GatewayConfiguration
17
- from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
17
+ from dstack._internal.core.models.profiles import ProfileParams, parse_duration, parse_off_duration
18
18
  from dstack._internal.core.models.resources import Range, ResourcesSpec
19
19
  from dstack._internal.core.models.services import AnyModel, OpenAIChatModel
20
20
  from dstack._internal.core.models.unix import UnixUser
21
21
  from dstack._internal.core.models.volumes import MountPoint, VolumeConfiguration, parse_mount_point
22
+ from dstack._internal.utils.common import has_duplicates
22
23
  from dstack._internal.utils.json_utils import (
23
24
  pydantic_orjson_dumps_with_indent,
24
25
  )
@@ -32,6 +33,14 @@ RUN_PRIOTIRY_MIN = 0
32
33
  RUN_PRIOTIRY_MAX = 100
33
34
  RUN_PRIORITY_DEFAULT = 0
34
35
  DEFAULT_REPO_DIR = "/workflow"
36
+ MIN_PROBE_TIMEOUT = 1
37
+ MIN_PROBE_INTERVAL = 1
38
+ DEFAULT_PROBE_URL = "/"
39
+ DEFAULT_PROBE_TIMEOUT = 10
40
+ DEFAULT_PROBE_INTERVAL = 15
41
+ DEFAULT_PROBE_READY_AFTER = 1
42
+ DEFAULT_PROBE_METHOD = "get"
43
+ MAX_PROBE_URL_LEN = 2048
35
44
 
36
45
 
37
46
  class RunConfigurationType(str, Enum):
@@ -162,6 +171,121 @@ class RateLimit(CoreModel):
162
171
  ] = 0
163
172
 
164
173
 
174
+ HTTPMethod = Literal["get", "post", "put", "delete", "patch", "head"]
175
+
176
+
177
class HTTPHeaderSpec(CoreModel):
    """An HTTP header (name/value pair) sent with a probe request."""

    name: Annotated[
        str,
        Field(
            description="The name of the HTTP header",
            min_length=1,
            max_length=256,
        ),
    ]
    value: Annotated[
        str,
        Field(
            description="The value of the HTTP header",
            min_length=1,
            max_length=2048,
        ),
    ]
194
+
195
+
196
class ProbeConfig(CoreModel):
    """Configuration of a single HTTP health probe for a service replica."""

    type: Literal["http"]  # expect other probe types in the future, namely `exec`
    url: Annotated[
        Optional[str], Field(description=f"The URL to request. Defaults to `{DEFAULT_PROBE_URL}`")
    ] = None
    method: Annotated[
        Optional[HTTPMethod],
        Field(
            description=(
                "The HTTP method to use for the probe (e.g., `get`, `post`, etc.)."
                f" Defaults to `{DEFAULT_PROBE_METHOD}`"
            )
        ),
    ] = None
    headers: Annotated[
        list[HTTPHeaderSpec],
        Field(description="A list of HTTP headers to include in the request", max_items=16),
    ] = []
    body: Annotated[
        Optional[str],
        Field(
            description="The HTTP request body to send with the probe",
            min_length=1,
            max_length=2048,
        ),
    ] = None
    timeout: Annotated[
        Optional[Union[int, str]],
        Field(
            description=(
                f"Maximum amount of time the HTTP request is allowed to take. Defaults to `{DEFAULT_PROBE_TIMEOUT}s`"
            )
        ),
    ] = None
    interval: Annotated[
        Optional[Union[int, str]],
        Field(
            description=(
                "Minimum amount of time between the end of one probe execution"
                f" and the start of the next. Defaults to `{DEFAULT_PROBE_INTERVAL}s`"
            )
        ),
    ] = None
    ready_after: Annotated[
        Optional[int],
        Field(
            ge=1,
            description=(
                "The number of consecutive successful probe executions required for the replica"
                " to be considered ready. Used during rolling deployments."
                f" Defaults to `{DEFAULT_PROBE_READY_AFTER}`"
            ),
        ),
    ] = None

    @validator("timeout")
    def parse_timeout(cls, v: Optional[Union[int, str]]) -> Optional[int]:
        """Normalize a duration string/int to seconds and enforce the minimum."""
        if v is None:
            return v
        parsed = parse_duration(v)
        if parsed < MIN_PROBE_TIMEOUT:
            raise ValueError(f"Probe timeout cannot be shorter than {MIN_PROBE_TIMEOUT}s")
        return parsed

    @validator("interval")
    def parse_interval(cls, v: Optional[Union[int, str]]) -> Optional[int]:
        """Normalize a duration string/int to seconds and enforce the minimum."""
        if v is None:
            return v
        parsed = parse_duration(v)
        if parsed < MIN_PROBE_INTERVAL:
            raise ValueError(f"Probe interval cannot be shorter than {MIN_PROBE_INTERVAL}s")
        return parsed

    @validator("url")
    def validate_url(cls, v: Optional[str]) -> Optional[str]:
        """Require an absolute path, bounded length, and printable characters."""
        if v is None:
            return v
        if not v.startswith("/"):
            raise ValueError("Must start with `/`")
        if len(v) > MAX_PROBE_URL_LEN:
            raise ValueError(f"Cannot be longer than {MAX_PROBE_URL_LEN} characters")
        if not v.isprintable():
            raise ValueError("Cannot contain non-printable characters")
        return v

    @root_validator
    def validate_body_matches_method(cls, values):
        """Reject a request body for methods that must not carry one.

        Bug fix: use `values.get(...)` instead of `values[...]` — in a pydantic
        v1 root_validator, a field whose own validation failed is absent from
        `values`, so direct indexing raised KeyError and masked the real error.
        NOTE(review): when `method` is omitted it defaults to
        `DEFAULT_PROBE_METHOD` ("get") later; a body with an omitted method is
        still accepted here — confirm whether that combination should also be
        rejected.
        """
        method: Optional[HTTPMethod] = values.get("method")
        if values.get("body") is not None and method in ["get", "head"]:
            raise ValueError(f"Cannot set request body for the `{method}` method")
        return values
287
+
288
+
165
289
  class BaseRunConfiguration(CoreModel):
166
290
  type: Literal["none"]
167
291
  name: Annotated[
@@ -448,6 +572,10 @@ class ServiceConfigurationParams(CoreModel):
448
572
  Field(description="The auto-scaling rules. Required if `replicas` is set to a range"),
449
573
  ] = None
450
574
  rate_limits: Annotated[list[RateLimit], Field(description="Rate limiting rules")] = []
575
+ probes: Annotated[
576
+ list[ProbeConfig],
577
+ Field(description="List of probes used to determine job health"),
578
+ ] = []
451
579
 
452
580
  @validator("port")
453
581
  def convert_port(cls, v) -> PortMapping:
@@ -511,6 +639,16 @@ class ServiceConfigurationParams(CoreModel):
511
639
  )
512
640
  return v
513
641
 
642
+ @validator("probes")
643
+ def validate_probes(cls, v: list[ProbeConfig]) -> list[ProbeConfig]:
644
+ if has_duplicates(v):
645
+ # Using a custom validator instead of Field(unique_items=True) to avoid Pydantic bug:
646
+ # https://github.com/pydantic/pydantic/issues/3765
647
+ # Because of the bug, our gen_schema_reference.py fails to determine the type of
648
+ # ServiceConfiguration.probes and insert the correct hyperlink.
649
+ raise ValueError("Probes must be unique")
650
+ return v
651
+
514
652
 
515
653
  class ServiceConfiguration(
516
654
  ProfileParams, BaseRunConfigurationWithCommands, ServiceConfigurationParams
@@ -0,0 +1,28 @@
1
+ from datetime import datetime
2
+ from enum import Enum
3
+
4
+ from dstack._internal.core.models.common import CoreModel
5
+
6
+
7
class HealthStatus(str, Enum):
    """Health state of an instance: healthy, degraded (warning), or failed."""

    HEALTHY = "healthy"
    WARNING = "warning"
    FAILURE = "failure"

    def is_healthy(self) -> bool:
        """True only for the HEALTHY state."""
        return self is HealthStatus.HEALTHY

    def is_failure(self) -> bool:
        """True only for the FAILURE state."""
        return self is HealthStatus.FAILURE
17
+
18
+
19
class HealthEvent(CoreModel):
    # A single timestamped health observation with a human-readable message.
    timestamp: datetime
    status: HealthStatus
    message: str


class HealthCheck(CoreModel):
    # Aggregated health report: the status as of `collected_at` plus the
    # events it was derived from.
    collected_at: datetime
    status: HealthStatus
    events: list[HealthEvent]
@@ -9,6 +9,7 @@ from pydantic import root_validator
9
9
  from dstack._internal.core.models.backends.base import BackendType
10
10
  from dstack._internal.core.models.common import CoreModel
11
11
  from dstack._internal.core.models.envs import Env
12
+ from dstack._internal.core.models.health import HealthStatus
12
13
  from dstack._internal.core.models.volumes import Volume
13
14
  from dstack._internal.utils.common import pretty_resources
14
15
 
@@ -225,6 +226,7 @@ class Instance(CoreModel):
225
226
  hostname: Optional[str] = None
226
227
  status: InstanceStatus
227
228
  unreachable: bool = False
229
+ health_status: HealthStatus = HealthStatus.HEALTHY
228
230
  termination_reason: Optional[str] = None
229
231
  created: datetime.datetime
230
232
  region: Optional[str] = None
@@ -23,4 +23,5 @@ class LogEvent(CoreModel):
23
23
 
24
24
  class JobSubmissionLogs(CoreModel):
25
25
  logs: List[LogEvent]
26
- next_token: Optional[str]
26
+ external_url: Optional[str] = None
27
+ next_token: Optional[str] = None