PyPI - dstack - Versions diffs - 0.19.21__py3-none-any.whl → 0.19.23rc1__py3-none-any.whl - Mend

dstack 0.19.21__py3-none-any.whl → 0.19.23rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of dstack might be problematic. Click here for more details.

Files changed (71)

dstack/_internal/core/backends/hotaisle/configurator.py ADDED Viewed

@@ -0,0 +1,60 @@
+import json
+from dstack._internal.core.backends.base.configurator import (
+    BackendRecord,
+    Configurator,
+)
+from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient
+from dstack._internal.core.backends.hotaisle.backend import HotAisleBackend
+from dstack._internal.core.backends.hotaisle.models import (
+    AnyHotAisleBackendConfig,
+    AnyHotAisleCreds,
+    HotAisleBackendConfig,
+    HotAisleBackendConfigWithCreds,
+    HotAisleConfig,
+    HotAisleCreds,
+    HotAisleStoredConfig,
+)
+from dstack._internal.core.models.backends.base import (
+    BackendType,
+)
+class HotAisleConfigurator(Configurator):
+    TYPE = BackendType.HOTAISLE
+    BACKEND_CLASS = HotAisleBackend
+    def validate_config(self, config: HotAisleBackendConfigWithCreds, default_creds_enabled: bool):
+        self._validate_creds(config.creds, config.team_handle)
+    def create_backend(
+        self, project_name: str, config: HotAisleBackendConfigWithCreds
+    ) -> BackendRecord:
+        return BackendRecord(
+            config=HotAisleStoredConfig(
+                **HotAisleBackendConfig.__response__.parse_obj(config).dict()
+            ).json(),
+            auth=HotAisleCreds.parse_obj(config.creds).json(),
+        )
+    def get_backend_config(
+        self, record: BackendRecord, include_creds: bool
+    ) -> AnyHotAisleBackendConfig:
+        config = self._get_config(record)
+        if include_creds:
+            return HotAisleBackendConfigWithCreds.__response__.parse_obj(config)
+        return HotAisleBackendConfig.__response__.parse_obj(config)
+    def get_backend(self, record: BackendRecord) -> HotAisleBackend:
+        config = self._get_config(record)
+        return HotAisleBackend(config=config)
+    def _get_config(self, record: BackendRecord) -> HotAisleConfig:
+        return HotAisleConfig.__response__(
+            **json.loads(record.config),
+            creds=HotAisleCreds.parse_raw(record.auth),
+        )
+    def _validate_creds(self, creds: AnyHotAisleCreds, team_handle: str):
+        api_client = HotAisleAPIClient(creds.api_key, team_handle)
+        api_client.validate_api_key()

dstack/_internal/core/backends/hotaisle/models.py ADDED Viewed

@@ -0,0 +1,45 @@
+from typing import Annotated, List, Literal, Optional, Union
+from pydantic import Field
+from dstack._internal.core.models.common import CoreModel
+class HotAisleAPIKeyCreds(CoreModel):
+    type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key"
+    api_key: Annotated[str, Field(description="The Hot Aisle API key")]
+AnyHotAisleCreds = HotAisleAPIKeyCreds
+HotAisleCreds = AnyHotAisleCreds
+class HotAisleBackendConfig(CoreModel):
+    type: Annotated[
+        Literal["hotaisle"],
+        Field(description="The type of backend"),
+    ] = "hotaisle"
+    team_handle: Annotated[str, Field(description="The Hot Aisle team handle")]
+    regions: Annotated[
+        Optional[List[str]],
+        Field(description="The list of Hot Aisle regions. Omit to use all regions"),
+    ] = None
+class HotAisleBackendConfigWithCreds(HotAisleBackendConfig):
+    creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")]
+AnyHotAisleBackendConfig = Union[HotAisleBackendConfig, HotAisleBackendConfigWithCreds]
+class HotAisleBackendFileConfigWithCreds(HotAisleBackendConfig):
+    creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")]
+class HotAisleStoredConfig(HotAisleBackendConfig):
+    pass
+class HotAisleConfig(HotAisleStoredConfig):
+    creds: AnyHotAisleCreds

dstack/_internal/core/backends/lambdalabs/compute.py CHANGED Viewed

@@ -206,10 +206,11 @@ def _launch_runner(
     ssh_private_key: str,
     launch_command: str,
 ):
+    daemonized_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown"
     _run_ssh_command(
         hostname=hostname,
         ssh_private_key=ssh_private_key,
-        command=launch_command,
+        command=daemonized_command,
     )

dstack/_internal/core/backends/models.py CHANGED Viewed

@@ -29,6 +29,11 @@ from dstack._internal.core.backends.gcp.models import (
     GCPBackendConfigWithCreds,
     GCPBackendFileConfigWithCreds,
 )
+from dstack._internal.core.backends.hotaisle.models import (
+    HotAisleBackendConfig,
+    HotAisleBackendConfigWithCreds,
+    HotAisleBackendFileConfigWithCreds,
+)
 from dstack._internal.core.backends.kubernetes.models import (
     KubernetesBackendConfig,
     KubernetesBackendConfigWithCreds,
@@ -73,6 +78,7 @@ AnyBackendConfigWithoutCreds = Union[
     CudoBackendConfig,
     DataCrunchBackendConfig,
     GCPBackendConfig,
+    HotAisleBackendConfig,
     KubernetesBackendConfig,
     LambdaBackendConfig,
     NebiusBackendConfig,
@@ -95,6 +101,7 @@ AnyBackendConfigWithCreds = Union[
     CudoBackendConfigWithCreds,
     DataCrunchBackendConfigWithCreds,
     GCPBackendConfigWithCreds,
+    HotAisleBackendConfigWithCreds,
     KubernetesBackendConfigWithCreds,
     LambdaBackendConfigWithCreds,
     OCIBackendConfigWithCreds,
@@ -116,6 +123,7 @@ AnyBackendFileConfigWithCreds = Union[
     CudoBackendConfigWithCreds,
     DataCrunchBackendConfigWithCreds,
     GCPBackendFileConfigWithCreds,
+    HotAisleBackendFileConfigWithCreds,
     KubernetesBackendFileConfigWithCreds,
     LambdaBackendConfigWithCreds,
     OCIBackendConfigWithCreds,

dstack/_internal/core/backends/nebius/compute.py CHANGED Viewed

@@ -74,6 +74,7 @@ SETUP_COMMANDS = [
 SUPPORTED_PLATFORMS = [
     "gpu-h100-sxm",
     "gpu-h200-sxm",
+    "gpu-b200-sxm",
     "gpu-l40s-a",
     "gpu-l40s-d",
     "cpu-d3",
@@ -150,12 +151,16 @@ class NebiusCompute(
             )
             if backend_data.cluster is not None:
                 cluster_id = backend_data.cluster.id
+        gpus = instance_offer.instance.resources.gpus
         create_disk_op = resources.create_disk(
             sdk=self._sdk,
             name=instance_name,
             project_id=self._region_to_project_id[instance_offer.region],
             size_mib=instance_offer.instance.resources.disk.size_mib,
-            image_family="ubuntu22.04-cuda12",
+            image_family="ubuntu24.04-cuda12"
+            if gpus and gpus[0].name == "B200"
+            else "ubuntu22.04-cuda12",
         )
         create_instance_op = None
         try:
@@ -180,6 +185,7 @@ class NebiusCompute(
                 cluster_id=cluster_id,
                 disk_id=create_disk_op.resource_id,
                 subnet_id=self._get_subnet_id(instance_offer.region),
+                preemptible=instance_offer.instance.resources.spot,
             )
             _wait_for_instance(self._sdk, create_instance_op)
         except BaseException:
@@ -367,4 +373,4 @@ def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
 def _supported_instances(offer: InstanceOffer) -> bool:
     platform, _ = offer.instance.name.split()
-    return platform in SUPPORTED_PLATFORMS and not offer.instance.resources.spot
+    return platform in SUPPORTED_PLATFORMS

dstack/_internal/core/backends/nebius/fabrics.py CHANGED Viewed

@@ -21,6 +21,7 @@ INFINIBAND_FABRICS = [
     InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
     InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
     InfinibandFabric("us-central1-a", "gpu-h200-sxm", "us-central1"),
+    InfinibandFabric("us-central1-b", "gpu-b200-sxm", "us-central1"),
 ]

dstack/_internal/core/backends/nebius/resources.py CHANGED Viewed

@@ -28,10 +28,12 @@ from nebius.api.nebius.compute.v1 import (
     GpuClusterSpec,
     Instance,
     InstanceGpuClusterSpec,
+    InstanceRecoveryPolicy,
     InstanceServiceClient,
     InstanceSpec,
     IPAddress,
     NetworkInterfaceSpec,
+    PreemptibleSpec,
     PublicIPAddress,
     ResourcesSpec,
     SourceImageFamily,
@@ -283,6 +285,7 @@ def create_instance(
     cluster_id: Optional[str],
     disk_id: str,
     subnet_id: str,
+    preemptible: bool,
 ) -> SDKOperation[Operation]:
     client = InstanceServiceClient(sdk)
     request = CreateInstanceRequest(
@@ -306,6 +309,12 @@ def create_instance(
                     public_ip_address=PublicIPAddress(static=True),
                 )
             ],
+            preemptible=PreemptibleSpec(
+                priority=1, on_preemption=PreemptibleSpec.PreemptionPolicy.STOP
+            )
+            if preemptible
+            else None,
+            recovery_policy=InstanceRecoveryPolicy.FAIL if preemptible else None,
         ),
     )
     with wrap_capacity_errors():

dstack/_internal/core/compatibility/runs.py CHANGED Viewed

@@ -53,6 +53,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
             job_submissions_excludes["exit_status"] = True
         if all(js.deployment_num == 0 for js in job_submissions):
             job_submissions_excludes["deployment_num"] = True
+        if all(not js.probes for js in job_submissions):
+            job_submissions_excludes["probes"] = True
         latest_job_submission = current_resource.latest_job_submission
         if latest_job_submission is not None:
             latest_job_submission_excludes: IncludeExcludeDictType = {}
@@ -69,6 +71,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
                 latest_job_submission_excludes["exit_status"] = True
             if latest_job_submission.deployment_num == 0:
                 latest_job_submission_excludes["deployment_num"] = True
+            if not latest_job_submission.probes:
+                latest_job_submission_excludes["probes"] = True
     return {"plan": apply_plan_excludes}
@@ -120,6 +124,8 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
         profile_excludes.add("startup_order")
     if configuration.stop_criteria is None:
         configuration_excludes["stop_criteria"] = True
+    if isinstance(configuration, ServiceConfiguration) and not configuration.probes:
+        configuration_excludes["probes"] = True
     if profile is not None and profile.stop_criteria is None:
         profile_excludes.add("stop_criteria")
     if not configuration.files:
@@ -154,6 +160,8 @@ def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType:
         spec_excludes["file_archives"] = True
     if all(s.service_port is None for s in job_specs):
         spec_excludes["service_port"] = True
+    if all(not s.probes for s in job_specs):
+        spec_excludes["probes"] = True
     return spec_excludes

dstack/_internal/core/models/backends/base.py CHANGED Viewed

@@ -11,6 +11,7 @@ class BackendType(str, enum.Enum):
         DSTACK (BackendType): dstack Sky
         GCP (BackendType): Google Cloud Platform
         DATACRUNCH (BackendType): DataCrunch
+        HOTAISLE (BackendType): Hot Aisle
         KUBERNETES (BackendType): Kubernetes
         LAMBDA (BackendType): Lambda Cloud
         NEBIUS (BackendType): Nebius AI Cloud
@@ -28,6 +29,7 @@ class BackendType(str, enum.Enum):
     DATACRUNCH = "datacrunch"
     DSTACK = "dstack"
     GCP = "gcp"
+    HOTAISLE = "hotaisle"
     KUBERNETES = "kubernetes"
     LAMBDA = "lambda"
     LOCAL = "local"

dstack/_internal/core/models/configurations.py CHANGED Viewed

@@ -14,11 +14,12 @@ from dstack._internal.core.models.envs import Env
 from dstack._internal.core.models.files import FilePathMapping
 from dstack._internal.core.models.fleets import FleetConfiguration
 from dstack._internal.core.models.gateways import GatewayConfiguration
-from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
+from dstack._internal.core.models.profiles import ProfileParams, parse_duration, parse_off_duration
 from dstack._internal.core.models.resources import Range, ResourcesSpec
 from dstack._internal.core.models.services import AnyModel, OpenAIChatModel
 from dstack._internal.core.models.unix import UnixUser
 from dstack._internal.core.models.volumes import MountPoint, VolumeConfiguration, parse_mount_point
+from dstack._internal.utils.common import has_duplicates
 from dstack._internal.utils.json_utils import (
     pydantic_orjson_dumps_with_indent,
 )
@@ -32,6 +33,14 @@ RUN_PRIOTIRY_MIN = 0
 RUN_PRIOTIRY_MAX = 100
 RUN_PRIORITY_DEFAULT = 0
 DEFAULT_REPO_DIR = "/workflow"
+MIN_PROBE_TIMEOUT = 1
+MIN_PROBE_INTERVAL = 1
+DEFAULT_PROBE_URL = "/"
+DEFAULT_PROBE_TIMEOUT = 10
+DEFAULT_PROBE_INTERVAL = 15
+DEFAULT_PROBE_READY_AFTER = 1
+DEFAULT_PROBE_METHOD = "get"
+MAX_PROBE_URL_LEN = 2048
 class RunConfigurationType(str, Enum):
@@ -162,6 +171,121 @@ class RateLimit(CoreModel):
     ] = 0
+HTTPMethod = Literal["get", "post", "put", "delete", "patch", "head"]
+class HTTPHeaderSpec(CoreModel):
+    name: Annotated[
+        str,
+        Field(
+            description="The name of the HTTP header",
+            min_length=1,
+            max_length=256,
+        ),
+    ]
+    value: Annotated[
+        str,
+        Field(
+            description="The value of the HTTP header",
+            min_length=1,
+            max_length=2048,
+        ),
+    ]
+class ProbeConfig(CoreModel):
+    type: Literal["http"]  # expect other probe types in the future, namely `exec`
+    url: Annotated[
+        Optional[str], Field(description=f"The URL to request. Defaults to `{DEFAULT_PROBE_URL}`")
+    ] = None
+    method: Annotated[
+        Optional[HTTPMethod],
+        Field(
+            description=(
+                "The HTTP method to use for the probe (e.g., `get`, `post`, etc.)."
+                f" Defaults to `{DEFAULT_PROBE_METHOD}`"
+            )
+        ),
+    ] = None
+    headers: Annotated[
+        list[HTTPHeaderSpec],
+        Field(description="A list of HTTP headers to include in the request", max_items=16),
+    ] = []
+    body: Annotated[
+        Optional[str],
+        Field(
+            description="The HTTP request body to send with the probe",
+            min_length=1,
+            max_length=2048,
+        ),
+    ] = None
+    timeout: Annotated[
+        Optional[Union[int, str]],
+        Field(
+            description=(
+                f"Maximum amount of time the HTTP request is allowed to take. Defaults to `{DEFAULT_PROBE_TIMEOUT}s`"
+            )
+        ),
+    ] = None
+    interval: Annotated[
+        Optional[Union[int, str]],
+        Field(
+            description=(
+                "Minimum amount of time between the end of one probe execution"
+                f" and the start of the next. Defaults to `{DEFAULT_PROBE_INTERVAL}s`"
+            )
+        ),
+    ] = None
+    ready_after: Annotated[
+        Optional[int],
+        Field(
+            ge=1,
+            description=(
+                "The number of consecutive successful probe executions required for the replica"
+                " to be considered ready. Used during rolling deployments."
+                f" Defaults to `{DEFAULT_PROBE_READY_AFTER}`"
+            ),
+        ),
+    ] = None
+    @validator("timeout")
+    def parse_timeout(cls, v: Optional[Union[int, str]]) -> Optional[int]:
+        if v is None:
+            return v
+        parsed = parse_duration(v)
+        if parsed < MIN_PROBE_TIMEOUT:
+            raise ValueError(f"Probe timeout cannot be shorter than {MIN_PROBE_TIMEOUT}s")
+        return parsed
+    @validator("interval")
+    def parse_interval(cls, v: Optional[Union[int, str]]) -> Optional[int]:
+        if v is None:
+            return v
+        parsed = parse_duration(v)
+        if parsed < MIN_PROBE_INTERVAL:
+            raise ValueError(f"Probe interval cannot be shorter than {MIN_PROBE_INTERVAL}s")
+        return parsed
+    @validator("url")
+    def validate_url(cls, v: Optional[str]) -> Optional[str]:
+        if v is None:
+            return v
+        if not v.startswith("/"):
+            raise ValueError("Must start with `/`")
+        if len(v) > MAX_PROBE_URL_LEN:
+            raise ValueError(f"Cannot be longer than {MAX_PROBE_URL_LEN} characters")
+        if not v.isprintable():
+            raise ValueError("Cannot contain non-printable characters")
+        return v
+    @root_validator
+    def validate_body_matches_method(cls, values):
+        method: HTTPMethod = values["method"]
+        if values["body"] is not None and method in ["get", "head"]:
+            raise ValueError(f"Cannot set request body for the `{method}` method")
+        return values
 class BaseRunConfiguration(CoreModel):
     type: Literal["none"]
     name: Annotated[
@@ -448,6 +572,10 @@ class ServiceConfigurationParams(CoreModel):
         Field(description="The auto-scaling rules. Required if `replicas` is set to a range"),
     ] = None
     rate_limits: Annotated[list[RateLimit], Field(description="Rate limiting rules")] = []
+    probes: Annotated[
+        list[ProbeConfig],
+        Field(description="List of probes used to determine job health"),
+    ] = []
     @validator("port")
     def convert_port(cls, v) -> PortMapping:
@@ -511,6 +639,16 @@ class ServiceConfigurationParams(CoreModel):
             )
         return v
+    @validator("probes")
+    def validate_probes(cls, v: list[ProbeConfig]) -> list[ProbeConfig]:
+        if has_duplicates(v):
+            # Using a custom validator instead of Field(unique_items=True) to avoid Pydantic bug:
+            # https://github.com/pydantic/pydantic/issues/3765
+            # Because of the bug, our gen_schema_reference.py fails to determine the type of
+            # ServiceConfiguration.probes and insert the correct hyperlink.
+            raise ValueError("Probes must be unique")
+        return v
 class ServiceConfiguration(
     ProfileParams, BaseRunConfigurationWithCommands, ServiceConfigurationParams

dstack/_internal/core/models/health.py ADDED Viewed

@@ -0,0 +1,28 @@
+from datetime import datetime
+from enum import Enum
+from dstack._internal.core.models.common import CoreModel
+class HealthStatus(str, Enum):
+    HEALTHY = "healthy"
+    WARNING = "warning"
+    FAILURE = "failure"
+    def is_healthy(self) -> bool:
+        return self == self.HEALTHY
+    def is_failure(self) -> bool:
+        return self == self.FAILURE
+class HealthEvent(CoreModel):
+    timestamp: datetime
+    status: HealthStatus
+    message: str
+class HealthCheck(CoreModel):
+    collected_at: datetime
+    status: HealthStatus
+    events: list[HealthEvent]

dstack/_internal/core/models/instances.py CHANGED Viewed

@@ -9,6 +9,7 @@ from pydantic import root_validator
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel
 from dstack._internal.core.models.envs import Env
+from dstack._internal.core.models.health import HealthStatus
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.utils.common import pretty_resources
@@ -225,6 +226,7 @@ class Instance(CoreModel):
     hostname: Optional[str] = None
     status: InstanceStatus
     unreachable: bool = False
+    health_status: HealthStatus = HealthStatus.HEALTHY
     termination_reason: Optional[str] = None
     created: datetime.datetime
     region: Optional[str] = None

dstack/_internal/core/models/logs.py CHANGED Viewed

@@ -23,4 +23,5 @@ class LogEvent(CoreModel):
 class JobSubmissionLogs(CoreModel):
     logs: List[LogEvent]
-    next_token: Optional[str]
+    external_url: Optional[str] = None
+    next_token: Optional[str] = None

dstack/_internal/core/models/runs.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from datetime import datetime, timedelta
 from enum import Enum
-from typing import Any, Dict, List, Optional, Type
+from typing import Any, Dict, List, Literal, Optional, Type
 from pydantic import UUID4, Field, root_validator
 from typing_extensions import Annotated
@@ -8,8 +8,11 @@ from typing_extensions import Annotated
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import ApplyAction, CoreModel, NetworkMode, RegistryAuth
 from dstack._internal.core.models.configurations import (
+    DEFAULT_PROBE_METHOD,
     DEFAULT_REPO_DIR,
     AnyRunConfiguration,
+    HTTPHeaderSpec,
+    HTTPMethod,
     RunConfiguration,
     ServiceConfiguration,
 )
@@ -223,6 +226,17 @@ class JobSSHKey(CoreModel):
     public: str
+class ProbeSpec(CoreModel):
+    type: Literal["http"]  # expect other probe types in the future, namely `exec`
+    url: str
+    method: HTTPMethod = DEFAULT_PROBE_METHOD
+    headers: list[HTTPHeaderSpec] = []
+    body: Optional[str] = None
+    timeout: int
+    interval: int
+    ready_after: int
 class JobSpec(CoreModel):
     replica_num: int = 0  # default value for backward compatibility
     job_num: int
@@ -256,6 +270,7 @@ class JobSpec(CoreModel):
     file_archives: list[FileArchiveMapping] = []
     # None for non-services and pre-0.19.19 services. See `get_service_port`
     service_port: Optional[int] = None
+    probes: list[ProbeSpec] = []
 class JobProvisioningData(CoreModel):
@@ -325,6 +340,10 @@ class ClusterInfo(CoreModel):
     gpus_per_job: int
+class Probe(CoreModel):
+    success_streak: int
 class JobSubmission(CoreModel):
     id: UUID4
     submission_num: int
@@ -341,6 +360,7 @@ class JobSubmission(CoreModel):
     job_provisioning_data: Optional[JobProvisioningData]
     job_runtime_data: Optional[JobRuntimeData]
     error: Optional[str] = None
+    probes: list[Probe] = []
     @property
     def age(self) -> timedelta:

dstack/_internal/core/services/ssh/tunnel.py CHANGED Viewed

@@ -236,6 +236,13 @@ class SSHTunnel:
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
+    async def __aenter__(self):
+        await self.aopen()
+        return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.aclose()
     def _get_proxy_command(self) -> Optional[str]:
         proxy_command: Optional[str] = None
         for params, identity_path in self.ssh_proxies:

dstack/_internal/server/app.py CHANGED Viewed

@@ -22,6 +22,7 @@ from dstack._internal.proxy.lib.deps import get_injector_from_app
 from dstack._internal.proxy.lib.routers import model_proxy
 from dstack._internal.server import settings
 from dstack._internal.server.background import start_background_tasks
+from dstack._internal.server.background.tasks.process_probes import PROBES_SCHEDULER
 from dstack._internal.server.db import get_db, get_session_ctx, migrate
 from dstack._internal.server.routers import (
     backends,
@@ -155,6 +156,7 @@ async def lifespan(app: FastAPI):
         scheduler = start_background_tasks()
     else:
         logger.info("Background processing is disabled")
+    PROBES_SCHEDULER.start()
     dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
     logger.info(f"The admin token is {admin.token.get_plaintext_or_error()}", {"show_path": False})
     logger.info(
@@ -166,6 +168,7 @@ async def lifespan(app: FastAPI):
     yield
     if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
         scheduler.shutdown()
+    PROBES_SCHEDULER.shutdown(wait=False)
     await gateway_connections_pool.remove_all()
     service_conn_pool = await get_injector_from_app(app).get_service_connection_pool()
     await service_conn_pool.remove_all()
@@ -197,6 +200,7 @@ def register_routes(app: FastAPI, ui: bool = True):
     app.include_router(fleets.root_router)
     app.include_router(fleets.project_router)
     app.include_router(instances.root_router)
+    app.include_router(instances.project_router)
     app.include_router(repos.router)
     app.include_router(runs.root_router)
     app.include_router(runs.project_router)

dstack/_internal/server/background/__init__.py CHANGED Viewed

@@ -9,6 +9,7 @@ from dstack._internal.server.background.tasks.process_gateways import (
 )
 from dstack._internal.server.background.tasks.process_idle_volumes import process_idle_volumes
 from dstack._internal.server.background.tasks.process_instances import (
+    delete_instance_health_checks,
     process_instances,
 )
 from dstack._internal.server.background.tasks.process_metrics import (
@@ -18,6 +19,7 @@ from dstack._internal.server.background.tasks.process_metrics import (
 from dstack._internal.server.background.tasks.process_placement_groups import (
     process_placement_groups,
 )
+from dstack._internal.server.background.tasks.process_probes import process_probes
 from dstack._internal.server.background.tasks.process_prometheus_metrics import (
     collect_prometheus_metrics,
     delete_prometheus_metrics,
@@ -63,6 +65,7 @@ def start_background_tasks() -> AsyncIOScheduler:
     # that the first waiting for the lock will acquire it.
     # The jitter is needed to give all tasks a chance to acquire locks.
+    _scheduler.add_job(process_probes, IntervalTrigger(seconds=3, jitter=1))
     _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
     _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
     if settings.ENABLE_PROMETHEUS_METRICS:
@@ -84,6 +87,7 @@ def start_background_tasks() -> AsyncIOScheduler:
         IntervalTrigger(seconds=10, jitter=2),
         max_instances=1,
     )
+    _scheduler.add_job(delete_instance_health_checks, IntervalTrigger(minutes=5), max_instances=1)
     for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):
         # Add multiple copies of tasks if requested.
         # max_instances=1 for additional copies to avoid running too many tasks.

dstack 0.19.21__py3-none-any.whl → 0.19.23rc1__py3-none-any.whl

Potentially problematic release.

dstack 0.19.21__py3-none-any.whl → 0.19.23rc1__py3-none-any.whl