dstack 0.19.8__py3-none-any.whl → 0.19.10__py3-none-any.whl

This diff represents the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of dstack might be problematic.

Files changed (42)
  1. dstack/_internal/cli/commands/config.py +1 -1
  2. dstack/_internal/cli/commands/project.py +161 -0
  3. dstack/_internal/cli/commands/ps.py +9 -2
  4. dstack/_internal/cli/main.py +2 -0
  5. dstack/_internal/cli/services/configurators/run.py +18 -11
  6. dstack/_internal/cli/utils/run.py +7 -2
  7. dstack/_internal/core/backends/azure/compute.py +5 -2
  8. dstack/_internal/core/backends/cudo/compute.py +1 -1
  9. dstack/_internal/core/backends/nebius/fabrics.py +1 -0
  10. dstack/_internal/core/backends/nebius/models.py +1 -1
  11. dstack/_internal/core/models/configurations.py +19 -3
  12. dstack/_internal/core/models/resources.py +1 -1
  13. dstack/_internal/core/models/runs.py +19 -7
  14. dstack/_internal/server/background/tasks/process_metrics.py +30 -11
  15. dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
  16. dstack/_internal/server/background/tasks/process_submitted_jobs.py +21 -12
  17. dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
  18. dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
  19. dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
  20. dstack/_internal/server/models.py +6 -1
  21. dstack/_internal/server/routers/repos.py +8 -4
  22. dstack/_internal/server/schemas/runner.py +41 -8
  23. dstack/_internal/server/services/instances.py +6 -2
  24. dstack/_internal/server/services/jobs/__init__.py +1 -0
  25. dstack/_internal/server/services/jobs/configurators/base.py +3 -3
  26. dstack/_internal/server/services/runner/client.py +7 -4
  27. dstack/_internal/server/services/runs.py +33 -20
  28. dstack/_internal/server/settings.py +21 -1
  29. dstack/_internal/server/statics/index.html +1 -1
  30. dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js → main-b4803049eac16aea9a49.js} +4 -4
  31. dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js.map → main-b4803049eac16aea9a49.js.map} +1 -1
  32. dstack/_internal/server/testing/common.py +4 -0
  33. dstack/_internal/server/utils/routers.py +3 -6
  34. dstack/_internal/settings.py +4 -0
  35. dstack/api/_public/runs.py +6 -3
  36. dstack/api/server/_runs.py +6 -0
  37. dstack/version.py +1 -1
  38. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/METADATA +46 -34
  39. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/RECORD +42 -38
  40. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/WHEEL +0 -0
  41. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/entry_points.txt +0 -0
  42. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/cli/commands/config.py
@@ -14,7 +14,7 @@ logger = get_logger(__name__)
 
 class ConfigCommand(BaseCommand):
     NAME = "config"
-    DESCRIPTION = "Configure CLI"
+    DESCRIPTION = "Configure CLI (deprecated; use `dstack project`)"
 
     def _register(self):
         super()._register()
dstack/_internal/cli/commands/project.py (new file)
@@ -0,0 +1,161 @@
+import argparse
+
+from requests import HTTPError
+from rich.table import Table
+
+import dstack.api.server
+from dstack._internal.cli.commands import BaseCommand
+from dstack._internal.cli.utils.common import confirm_ask, console
+from dstack._internal.core.errors import ClientError, CLIError
+from dstack._internal.core.services.configs import ConfigManager
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class ProjectCommand(BaseCommand):
+    NAME = "project"
+    DESCRIPTION = "Manage projects configs"
+
+    def _register(self):
+        super()._register()
+        subparsers = self._parser.add_subparsers(dest="subcommand", help="Command to execute")
+
+        # Add subcommand
+        add_parser = subparsers.add_parser("add", help="Add or update a project config")
+        add_parser.add_argument(
+            "--name", type=str, help="The name of the project to configure", required=True
+        )
+        add_parser.add_argument("--url", type=str, help="Server url", required=True)
+        add_parser.add_argument("--token", type=str, help="User token", required=True)
+        add_parser.add_argument(
+            "-y",
+            "--yes",
+            help="Don't ask for confirmation (e.g. update the config)",
+            action="store_true",
+        )
+        add_parser.add_argument(
+            "-n",
+            "--no",
+            help="Don't ask for confirmation (e.g. do not update the config)",
+            action="store_true",
+        )
+        add_parser.set_defaults(subfunc=self._add)
+
+        # Delete subcommand
+        delete_parser = subparsers.add_parser("delete", help="Delete a project config")
+        delete_parser.add_argument(
+            "--name", type=str, help="The name of the project to delete", required=True
+        )
+        delete_parser.add_argument(
+            "-y",
+            "--yes",
+            help="Don't ask for confirmation",
+            action="store_true",
+        )
+        delete_parser.set_defaults(subfunc=self._delete)
+
+        # List subcommand
+        list_parser = subparsers.add_parser("list", help="List configured projects")
+        list_parser.set_defaults(subfunc=self._list)
+
+        # Set default subcommand
+        set_default_parser = subparsers.add_parser("set-default", help="Set default project")
+        set_default_parser.add_argument(
+            "name", type=str, help="The name of the project to set as default"
+        )
+        set_default_parser.set_defaults(subfunc=self._set_default)
+
+    def _command(self, args: argparse.Namespace):
+        if not hasattr(args, "subfunc"):
+            args.subfunc = self._list
+        args.subfunc(args)
+
+    def _add(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        api_client = dstack.api.server.APIClient(base_url=args.url, token=args.token)
+        try:
+            api_client.projects.get(args.name)
+        except HTTPError as e:
+            if e.response.status_code == 403:
+                raise CLIError("Forbidden. Ensure the token is valid.")
+            elif e.response.status_code == 404:
+                raise CLIError(f"Project '{args.name}' not found.")
+            else:
+                raise e
+        default_project = config_manager.get_project_config()
+        if (
+            default_project is None
+            or default_project.name != args.name
+            or default_project.url != args.url
+            or default_project.token != args.token
+        ):
+            set_it_as_default = (
+                (
+                    args.yes
+                    or not default_project
+                    or confirm_ask(f"Set '{args.name}' as your default project?")
+                )
+                if not args.no
+                else False
+            )
+            config_manager.configure_project(
+                name=args.name, url=args.url, token=args.token, default=set_it_as_default
+            )
+            config_manager.save()
+        logger.info(
+            f"Configuration updated at {config_manager.config_filepath}", {"show_path": False}
+        )
+
+    def _delete(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        if args.yes or confirm_ask(f"Are you sure you want to delete project '{args.name}'?"):
+            config_manager.delete_project(args.name)
+            config_manager.save()
+            console.print("[grey58]OK[/]")
+
+    def _list(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        default_project = config_manager.get_project_config()
+
+        table = Table(box=None)
+        table.add_column("PROJECT", style="bold", no_wrap=True)
+        table.add_column("URL", style="grey58")
+        table.add_column("USER", style="grey58")
+        table.add_column("DEFAULT", justify="center")
+
+        for project_name in config_manager.list_projects():
+            project_config = config_manager.get_project_config(project_name)
+            is_default = project_name == default_project.name if default_project else False
+
+            # Get username from API
+            try:
+                api_client = dstack.api.server.APIClient(
+                    base_url=project_config.url, token=project_config.token
+                )
+                user_info = api_client.users.get_my_user()
+                username = user_info.username
+            except ClientError:
+                username = "(invalid token)"
+
+            table.add_row(
+                project_name,
+                project_config.url,
+                username,
+                "✓" if is_default else "",
+                style="bold" if is_default else None,
+            )
+
+        console.print(table)
+
+    def _set_default(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        project_config = config_manager.get_project_config(args.name)
+        if project_config is None:
+            raise CLIError(f"Project '{args.name}' not found")
+
+        config_manager.configure_project(
+            name=args.name, url=project_config.url, token=project_config.token, default=True
+        )
+        config_manager.save()
+        console.print("[grey58]OK[/]")
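
Note: an illustrative session with the new command, based on the argparse definitions above (server URL, project name, and token values are hypothetical):

    dstack project add --name my-project --url https://my-server.example --token <token>
    dstack project list
    dstack project set-default my-project
    dstack project delete --name my-project --yes

Running a bare `dstack project` falls back to the `list` subcommand via the `subfunc` default in `_command`.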
dstack/_internal/cli/commands/ps.py
@@ -36,10 +36,17 @@ class PsCommand(APIBaseCommand):
             help="Watch statuses of runs in realtime",
             action="store_true",
         )
+        self._parser.add_argument(
+            "-n",
+            "--last",
+            help="Show only the last N runs. Implies --all",
+            type=int,
+            default=None,
+        )
 
     def _command(self, args: argparse.Namespace):
         super()._command(args)
-        runs = self.api.runs.list(all=args.all)
+        runs = self.api.runs.list(all=args.all, limit=args.last)
         if not args.watch:
             console.print(run_utils.get_runs_table(runs, verbose=args.verbose))
             return
@@ -49,6 +56,6 @@ class PsCommand(APIBaseCommand):
                 while True:
                     live.update(run_utils.get_runs_table(runs, verbose=args.verbose))
                     time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS)
-                    runs = self.api.runs.list(all=args.all)
+                    runs = self.api.runs.list(all=args.all, limit=args.last)
         except KeyboardInterrupt:
             pass
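
Note: with the new flag, `dstack ps --last 5` (or `-n 5`) asks the server for at most the five most recent runs; per the help text it implies `--all`, so finished runs are included. The flag is forwarded as the `limit` argument of `runs.list` in both the one-shot and `--watch` code paths above.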
dstack/_internal/cli/main.py
@@ -15,6 +15,7 @@ from dstack._internal.cli.commands.init import InitCommand
 from dstack._internal.cli.commands.logs import LogsCommand
 from dstack._internal.cli.commands.metrics import MetricsCommand
 from dstack._internal.cli.commands.offer import OfferCommand
+from dstack._internal.cli.commands.project import ProjectCommand
 from dstack._internal.cli.commands.ps import PsCommand
 from dstack._internal.cli.commands.server import ServerCommand
 from dstack._internal.cli.commands.stats import StatsCommand
@@ -69,6 +70,7 @@ def main():
     OfferCommand.register(subparsers)
     LogsCommand.register(subparsers)
     MetricsCommand.register(subparsers)
+    ProjectCommand.register(subparsers)
     PsCommand.register(subparsers)
     ServerCommand.register(subparsers)
     StatsCommand.register(subparsers)
dstack/_internal/cli/services/configurators/run.py
@@ -98,6 +98,8 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
         print_run_plan(run_plan, max_offers=configurator_args.max_offers)
 
         confirm_message = "Submit a new run?"
+        if conf.name:
+            confirm_message = f"Submit the run [code]{conf.name}[/]?"
         stop_run_name = None
         if run_plan.current_resource is not None:
             changed_fields = []
@@ -130,11 +132,6 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
                     f"Active run [code]{conf.name}[/] already exists and cannot be updated in-place."
                 )
                 confirm_message = "Stop and override the run?"
-            else:
-                console.print(f"Finished run [code]{conf.name}[/] already exists.")
-                confirm_message = "Override the run?"
-        elif conf.name:
-            confirm_message = f"Submit the run [code]{conf.name}[/]?"
 
         if not command_args.yes and not confirm_ask(confirm_message):
             console.print("\nExiting...")
@@ -560,7 +557,9 @@ def print_finished_message(run: Run):
         console.print("[code]Done[/]")
         return
 
-    termination_reason, termination_reason_message = _get_run_termination_reason(run)
+    termination_reason, termination_reason_message, exit_status = (
+        _get_run_termination_reason_and_exit_status(run)
+    )
     message = "Run failed due to unknown reason. Check CLI, server, and run logs."
     if run.status == RunStatus.TERMINATED:
         message = "Run terminated due to unknown reason. Check CLI, server, and run logs."
@@ -572,13 +571,15 @@ def print_finished_message(run: Run):
             "Check CLI and server logs for more details."
         )
     elif termination_reason is not None:
+        exit_status_details = f"Exit status: {exit_status}.\n" if exit_status else ""
         error_details = (
            f"Error: {termination_reason_message}\n" if termination_reason_message else ""
        )
         message = (
             f"Run failed with error code {termination_reason.name}.\n"
+            f"{exit_status_details}"
             f"{error_details}"
-            "Check CLI, server, and run logs for more details."
+            f"Check [bold]dstack logs -d {run.name}[/bold] for more details."
         )
     console.print(f"[error]{message}[/]")
 
@@ -589,14 +590,20 @@ def get_run_exit_code(run: Run) -> int:
     return 1
 
 
-def _get_run_termination_reason(run: Run) -> Tuple[Optional[JobTerminationReason], Optional[str]]:
+def _get_run_termination_reason_and_exit_status(
+    run: Run,
+) -> Tuple[Optional[JobTerminationReason], Optional[str], Optional[int]]:
     if len(run._run.jobs) == 0:
-        return None, None
+        return None, None, None
     job = run._run.jobs[0]
     if len(job.job_submissions) == 0:
-        return None, None
+        return None, None, None
     job_submission = job.job_submissions[0]
-    return job_submission.termination_reason, job_submission.termination_reason_message
+    return (
+        job_submission.termination_reason,
+        job_submission.termination_reason_message,
+        job_submission.exit_status,
+    )
 
 
 def _run_resubmitted(run: Run, current_job_submission: Optional[JobSubmission]) -> bool:
dstack/_internal/cli/utils/run.py
@@ -218,6 +218,11 @@ def _get_run_error(run: Run) -> str:
 
 
 def _get_job_error(job: Job) -> str:
-    if job.job_submissions[-1].termination_reason is None:
+    job_submission = job.job_submissions[-1]
+    termination_reason = job_submission.termination_reason
+    exit_status = job_submission.exit_status
+    if termination_reason is None:
         return ""
-    return job.job_submissions[-1].termination_reason.name
+    if exit_status:
+        return f"{termination_reason.name} {exit_status}"
+    return termination_reason.name
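
Note: with the exit status appended, the error column produced by `_get_job_error` can now read, e.g., `CONTAINER_EXITED_WITH_ERROR 137` rather than the bare termination reason name (the reason name here is illustrative).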
dstack/_internal/core/backends/azure/compute.py
@@ -391,9 +391,12 @@ class VMImageVariant(enum.Enum):
 
 
 _SUPPORTED_VM_SERIES_PATTERNS = [
-    r"D(\d+)s_v3",  # Dsv3-series
-    r"E(\d+)i?s_v4",  # Esv4-series
+    # TODO: Support newer CPU series (Dsv6, Esv6).
+    # They are NVMe-only and require marking the VM image as NVMe.
+    r"D(\d+)s_v3",  # Dsv3-series (general purpose)
+    r"E(\d+)i?s_v4",  # Esv4-series (memory optimized)
     r"E(\d+)-(\d+)s_v4",  # Esv4-series (constrained vCPU)
+    r"F(\d+)s_v2",  # Fsv2-series (compute optimized)
     r"NC(\d+)s_v3",  # NCv3-series [V100 16GB]
     r"NC(\d+)as_T4_v3",  # NCasT4_v3-series [T4]
     r"ND(\d+)rs_v2",  # NDv2-series [8xV100 32GB]
dstack/_internal/core/backends/cudo/compute.py
@@ -147,7 +147,7 @@ class CudoCompute(
 
 
 def _get_image_id(cuda: bool) -> str:
-    image_name = "ubuntu-2204-nvidia-535-docker-v20240214" if cuda else "ubuntu-2204"
+    image_name = "ubuntu-2204-nvidia-535-docker-v20241017" if cuda else "ubuntu-2204"
     return image_name
 
 
dstack/_internal/core/backends/nebius/fabrics.py
@@ -20,6 +20,7 @@ INFINIBAND_FABRICS = [
     InfinibandFabric("fabric-5", "gpu-h200-sxm", "eu-west1"),
     InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
     InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
+    InfinibandFabric("us-central1-a", "gpu-h200-sxm", "us-central1"),
 ]
 
 
dstack/_internal/core/backends/nebius/models.py
@@ -5,7 +5,7 @@ from pydantic import Field, root_validator
 from dstack._internal.core.backends.base.models import fill_data
 from dstack._internal.core.models.common import CoreModel
 
-DEFAULT_PROJECT_NAME_PREFIX = "default-project"
+DEFAULT_PROJECT_NAME_PREFIX = "default"
 
 
 class NebiusServiceAccountCreds(CoreModel):
dstack/_internal/core/models/configurations.py
@@ -23,6 +23,9 @@ ValidPort = conint(gt=0, le=65536)
 MAX_INT64 = 2**63 - 1
 SERVICE_HTTPS_DEFAULT = True
 STRIP_PREFIX_DEFAULT = True
+RUN_PRIOTIRY_MIN = 0
+RUN_PRIOTIRY_MAX = 100
+RUN_PRIORITY_DEFAULT = 0
 
 
 class RunConfigurationType(str, Enum):
@@ -77,7 +80,8 @@ class ScalingSpec(CoreModel):
         Field(
             description="The target value of the metric. "
             "The number of replicas is calculated based on this number and automatically adjusts "
-            "(scales up or down) as this metric changes"
+            "(scales up or down) as this metric changes",
+            gt=0,
         ),
     ]
     scale_up_delay: Annotated[
@@ -221,14 +225,26 @@ class BaseRunConfiguration(CoreModel):
             )
         ),
     ] = None
-    # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
-    setup: CommandsList = []
     resources: Annotated[
         ResourcesSpec, Field(description="The resources requirements to run the configuration")
     ] = ResourcesSpec()
+    priority: Annotated[
+        Optional[int],
+        Field(
+            ge=RUN_PRIOTIRY_MIN,
+            le=RUN_PRIOTIRY_MAX,
+            description=(
+                f"The priority of the run, an integer between `{RUN_PRIOTIRY_MIN}` and `{RUN_PRIOTIRY_MAX}`."
+                " `dstack` tries to provision runs with higher priority first."
+                f" Defaults to `{RUN_PRIORITY_DEFAULT}`"
+            ),
+        ),
+    ] = None
     volumes: Annotated[
         List[Union[MountPoint, str]], Field(description="The volumes mount points")
     ] = []
+    # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
+    setup: CommandsList = []
 
     @validator("python", pre=True, always=True)
     def convert_python(cls, v, values) -> Optional[PythonVersion]:
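
Note: the new field surfaces as a top-level key in run configurations. A minimal sketch of a task configuration using it (the `commands` value is illustrative):

    type: task
    commands:
      - python train.py
    priority: 50

Per the field description, runs with higher priority are provisioned first, and omitting the key is equivalent to the default of 0.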
dstack/_internal/core/models/resources.py
@@ -126,7 +126,7 @@ class ComputeCapability(Tuple[int, int]):
 
 DEFAULT_CPU_COUNT = Range[int](min=2)
 DEFAULT_MEMORY_SIZE = Range[Memory](min=Memory.parse("8GB"))
-DEFAULT_GPU_COUNT = Range[int](min=1, max=1)
+DEFAULT_GPU_COUNT = Range[int](min=1)
 
 
 class CPUSpec(CoreModel):
dstack/_internal/core/models/runs.py
@@ -104,6 +104,7 @@ class JobTerminationReason(str, Enum):
     # Set by the server
     FAILED_TO_START_DUE_TO_NO_CAPACITY = "failed_to_start_due_to_no_capacity"
     INTERRUPTED_BY_NO_CAPACITY = "interrupted_by_no_capacity"
+    INSTANCE_UNREACHABLE = "instance_unreachable"
     WAITING_INSTANCE_LIMIT_EXCEEDED = "waiting_instance_limit_exceeded"
     WAITING_RUNNER_LIMIT_EXCEEDED = "waiting_runner_limit_exceeded"
     TERMINATED_BY_USER = "terminated_by_user"
@@ -126,6 +127,7 @@ class JobTerminationReason(str, Enum):
         mapping = {
             self.FAILED_TO_START_DUE_TO_NO_CAPACITY: JobStatus.FAILED,
             self.INTERRUPTED_BY_NO_CAPACITY: JobStatus.FAILED,
+            self.INSTANCE_UNREACHABLE: JobStatus.FAILED,
             self.WAITING_INSTANCE_LIMIT_EXCEEDED: JobStatus.FAILED,
             self.WAITING_RUNNER_LIMIT_EXCEEDED: JobStatus.FAILED,
             self.TERMINATED_BY_USER: JobStatus.TERMINATED,
@@ -262,9 +264,9 @@ class JobRuntimeData(CoreModel):
     # or not applicable (container-based backends)
     ports: Optional[dict[int, int]] = None
     # List of volumes used by the job
-    volume_names: Optional[list[str]] = None  # None for backward compalibility
+    volume_names: Optional[list[str]] = None  # None for backward compatibility
     # Virtual shared offer
-    offer: Optional[InstanceOfferWithAvailability] = None  # None for backward compalibility
+    offer: Optional[InstanceOfferWithAvailability] = None  # None for backward compatibility
 
 
 class ClusterInfo(CoreModel):
@@ -283,6 +285,7 @@ class JobSubmission(CoreModel):
     status: JobStatus
     termination_reason: Optional[JobTerminationReason]
     termination_reason_message: Optional[str]
+    exit_status: Optional[int]
     job_provisioning_data: Optional[JobProvisioningData]
     job_runtime_data: Optional[JobRuntimeData]
 
@@ -508,7 +511,9 @@ def _get_run_error(
         return ""
     if len(run_jobs) > 1:
         return run_termination_reason.name
-    run_job_termination_reason = _get_run_job_termination_reason(run_jobs)
+    run_job_termination_reason, exit_status = _get_run_job_termination_reason_and_exit_status(
+        run_jobs
+    )
     # For failed runs, also show termination reason to provide more context.
     # For other run statuses, the job termination reason will duplicate run status.
     if run_job_termination_reason is not None and run_termination_reason in [
@@ -516,13 +521,20 @@ def _get_run_error(
         RunTerminationReason.SERVER_ERROR,
         RunTerminationReason.RETRY_LIMIT_EXCEEDED,
     ]:
+        if exit_status:
+            return (
+                f"{run_termination_reason.name}\n({run_job_termination_reason.name} {exit_status})"
+            )
         return f"{run_termination_reason.name}\n({run_job_termination_reason.name})"
     return run_termination_reason.name
 
 
-def _get_run_job_termination_reason(run_jobs: List[Job]) -> Optional[JobTerminationReason]:
+def _get_run_job_termination_reason_and_exit_status(
+    run_jobs: List[Job],
+) -> tuple[Optional[JobTerminationReason], Optional[int]]:
     for job in run_jobs:
         if len(job.job_submissions) > 0:
-            if job.job_submissions[-1].termination_reason is not None:
-                return job.job_submissions[-1].termination_reason
-    return None
+            job_submission = job.job_submissions[-1]
+            if job_submission.termination_reason is not None:
+                return job_submission.termination_reason, job_submission.exit_status
+    return None, None
dstack/_internal/server/background/tasks/process_metrics.py
@@ -2,7 +2,7 @@ import asyncio
 import json
 from typing import Dict, List, Optional
 
-from sqlalchemy import delete, select
+from sqlalchemy import Delete, delete, select
 from sqlalchemy.orm import joinedload
 
 from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT
@@ -42,11 +42,36 @@ async def collect_metrics():
 
 
 async def delete_metrics():
-    cutoff = _get_delete_metrics_cutoff()
+    now_timestamp_micro = int(get_current_datetime().timestamp() * 1_000_000)
+    running_timestamp_micro_cutoff = (
+        now_timestamp_micro - settings.SERVER_METRICS_RUNNING_TTL_SECONDS * 1_000_000
+    )
+    finished_timestamp_micro_cutoff = (
+        now_timestamp_micro - settings.SERVER_METRICS_FINISHED_TTL_SECONDS * 1_000_000
+    )
+    await asyncio.gather(
+        _execute_delete_statement(
+            delete(JobMetricsPoint).where(
+                JobMetricsPoint.job_id.in_(
+                    select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
+                ),
+                JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
+            )
+        ),
+        _execute_delete_statement(
+            delete(JobMetricsPoint).where(
+                JobMetricsPoint.job_id.in_(
+                    select(JobModel.id).where(JobModel.status.in_(JobStatus.finished_statuses()))
+                ),
+                JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
+            )
+        ),
+    )
+
+
+async def _execute_delete_statement(stmt: Delete) -> None:
     async with get_session_ctx() as session:
-        await session.execute(
-            delete(JobMetricsPoint).where(JobMetricsPoint.timestamp_micro < cutoff)
-        )
+        await session.execute(stmt)
         await session.commit()
 
 
@@ -134,9 +159,3 @@ def _pull_runner_metrics(
 ) -> Optional[MetricsResponse]:
     runner_client = client.RunnerClient(port=ports[DSTACK_RUNNER_HTTP_PORT])
     return runner_client.get_metrics()
-
-
-def _get_delete_metrics_cutoff() -> int:
-    now = int(get_current_datetime().timestamp() * 1_000_000)
-    cutoff = now - (settings.SERVER_METRICS_TTL_SECONDS * 1_000_000)
-    return cutoff
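
Note: the single SERVER_METRICS_TTL_SECONDS cutoff is replaced by separate TTLs for running and finished jobs (the two new settings come from settings.py, whose diff is not shown in this section). Timestamps stay in microseconds; a minimal standalone sketch of the cutoff arithmetic (ttl_seconds is a hypothetical input):

    from datetime import datetime, timezone

    def cutoff_micro(ttl_seconds: int) -> int:
        # points with timestamp_micro below this value are deleted
        now_micro = int(datetime.now(timezone.utc).timestamp() * 1_000_000)
        return now_micro - ttl_seconds * 1_000_000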
dstack/_internal/server/background/tasks/process_running_jobs.py
@@ -1,6 +1,6 @@
 import asyncio
 from collections.abc import Iterable
-from datetime import timedelta
+from datetime import timedelta, timezone
 from typing import Dict, List, Optional
 
 from sqlalchemy import select
@@ -71,6 +71,12 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Minimum time before terminating active job in case of connectivity issues.
+# Should be sufficient to survive most problems caused by
+# the server network flickering and providers' glitches.
+JOB_DISCONNECTED_RETRY_TIMEOUT = timedelta(minutes=2)
+
+
 async def process_running_jobs(batch_size: int = 1):
     tasks = []
     for _ in range(batch_size):
@@ -202,7 +208,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         user_ssh_key = run.run_spec.ssh_key_pub.strip()
         public_keys = [project.ssh_public_key.strip(), user_ssh_key]
         if job_provisioning_data.backend == BackendType.LOCAL:
-            # No need to update ~/.ssh/authorized_keys when running shim localy
+            # No need to update ~/.ssh/authorized_keys when running shim locally
            user_ssh_key = ""
         success = await common_utils.run_async(
             _process_provisioning_with_shim,
@@ -299,19 +305,38 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 run_model,
                 job_model,
             )
-            if not success:
-                job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
 
-    if not success:  # kill the job
-        logger.warning(
-            "%s: failed because runner is not available or return an error, age=%s",
-            fmt(job_model),
-            job_submission.age,
-        )
-        job_model.status = JobStatus.TERMINATING
-        if not job_model.termination_reason:
-            job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
-        # job will be terminated and instance will be emptied by process_terminating_jobs
+    if success:
+        job_model.disconnected_at = None
+    else:
+        if job_model.termination_reason:
+            logger.warning(
+                "%s: failed because shim/runner returned an error, age=%s",
+                fmt(job_model),
+                job_submission.age,
+            )
+            job_model.status = JobStatus.TERMINATING
+            # job will be terminated and instance will be emptied by process_terminating_jobs
+        else:
+            # No job_model.termination_reason set means ssh connection failed
+            if job_model.disconnected_at is None:
+                job_model.disconnected_at = common_utils.get_current_datetime()
+            if _should_terminate_job_due_to_disconnect(job_model):
+                logger.warning(
+                    "%s: failed because instance is unreachable, age=%s",
+                    fmt(job_model),
+                    job_submission.age,
+                )
+                # TODO: Replace with JobTerminationReason.INSTANCE_UNREACHABLE in 0.20 or
+                # when CLI <= 0.19.8 is no longer supported
+                job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
+                job_model.status = JobStatus.TERMINATING
+            else:
+                logger.warning(
+                    "%s: is unreachable, waiting for the instance to become reachable again, age=%s",
+                    fmt(job_model),
+                    job_submission.age,
+                )
 
     if (
         initial_status != job_model.status
@@ -543,7 +568,7 @@ def _process_pulling_with_shim(
     if shim_client.is_api_v2_supported():  # raises error if shim is down, causes retry
         task = shim_client.get_task(job_model.id)
 
-        # If task goes to terminated before the job is submitted to runner, then an error occured
+        # If task goes to terminated before the job is submitted to runner, then an error occurred
         if task.status == TaskStatus.TERMINATED:
             logger.warning(
                 "shim failed to execute job %s: %s (%s)",
@@ -572,7 +597,7 @@ def _process_pulling_with_shim(
     else:
         shim_status = shim_client.pull()  # raises error if shim is down, causes retry
 
-        # If shim goes to pending before the job is submitted to runner, then an error occured
+        # If shim goes to pending before the job is submitted to runner, then an error occurred
         if (
             shim_status.state == "pending"
             and shim_status.result is not None
@@ -651,6 +676,10 @@ def _process_running(
         )
         if latest_state_event.termination_message:
             job_model.termination_reason_message = latest_state_event.termination_message
+        if (exit_status := latest_state_event.exit_status) is not None:
+            job_model.exit_status = exit_status
+            if exit_status != 0:
+                logger.info("%s: non-zero exit status %s", fmt(job_model), exit_status)
     else:
         _terminate_if_inactivity_duration_exceeded(run_model, job_model, resp.no_connections_secs)
     if job_model.status != previous_status:
@@ -688,6 +717,15 @@ def _terminate_if_inactivity_duration_exceeded(
         )
 
 
+def _should_terminate_job_due_to_disconnect(job_model: JobModel) -> bool:
+    if job_model.disconnected_at is None:
+        return False
+    return (
+        common_utils.get_current_datetime()
+        > job_model.disconnected_at.replace(tzinfo=timezone.utc) + JOB_DISCONNECTED_RETRY_TIMEOUT
+    )
+
+
 async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
     policy = job.job_spec.utilization_policy
     if policy is None:
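
Note: the `.replace(tzinfo=timezone.utc)` in the helper above suggests `disconnected_at` is stored as a naive UTC datetime, so it must be made timezone-aware before comparing against the aware current time. A minimal standalone illustration of the pattern (dates are arbitrary):

    from datetime import datetime, timedelta, timezone

    naive = datetime(2025, 1, 1, 12, 0)         # naive UTC, e.g. as loaded from the DB
    aware = naive.replace(tzinfo=timezone.utc)  # attach UTC without shifting the value
    expired = datetime.now(timezone.utc) > aware + timedelta(minutes=2)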
@@ -818,8 +856,8 @@ def _submit_job_to_runner(
         return success_if_not_available
 
     runner_client.submit_job(
-        run_spec=run.run_spec,
-        job_spec=job.job_spec,
+        run=run,
+        job=job,
         cluster_info=cluster_info,
         secrets=secrets,
         repo_credentials=repo_credentials,