dstack 0.19.13__py3-none-any.whl → 0.19.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (35)
  1. dstack/_internal/cli/commands/attach.py +4 -4
  2. dstack/_internal/cli/services/configurators/run.py +1 -0
  3. dstack/_internal/cli/utils/run.py +16 -4
  4. dstack/_internal/core/compatibility/runs.py +6 -0
  5. dstack/_internal/core/models/projects.py +1 -0
  6. dstack/_internal/core/models/runs.py +23 -1
  7. dstack/_internal/server/app.py +45 -0
  8. dstack/_internal/server/background/tasks/process_running_jobs.py +45 -3
  9. dstack/_internal/server/background/tasks/process_runs.py +149 -79
  10. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +42 -0
  11. dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py +39 -0
  12. dstack/_internal/server/models.py +4 -0
  13. dstack/_internal/server/routers/projects.py +4 -3
  14. dstack/_internal/server/routers/prometheus.py +4 -1
  15. dstack/_internal/server/schemas/projects.py +1 -0
  16. dstack/_internal/server/security/permissions.py +36 -0
  17. dstack/_internal/server/services/jobs/__init__.py +1 -0
  18. dstack/_internal/server/services/jobs/configurators/base.py +11 -7
  19. dstack/_internal/server/services/projects.py +54 -1
  20. dstack/_internal/server/services/runs.py +49 -29
  21. dstack/_internal/server/services/services/__init__.py +19 -0
  22. dstack/_internal/server/services/services/autoscalers.py +37 -26
  23. dstack/_internal/server/statics/index.html +1 -1
  24. dstack/_internal/server/statics/{main-2066f1f22ddb4557bcde.js → main-0ac1e1583684417ae4d1.js} +26 -24
  25. dstack/_internal/server/statics/{main-2066f1f22ddb4557bcde.js.map → main-0ac1e1583684417ae4d1.js.map} +1 -1
  26. dstack/_internal/server/testing/common.py +9 -0
  27. dstack/_internal/settings.py +3 -0
  28. dstack/api/_public/runs.py +14 -5
  29. dstack/api/server/_projects.py +2 -2
  30. dstack/version.py +2 -2
  31. {dstack-0.19.13.dist-info → dstack-0.19.14.dist-info}/METADATA +1 -1
  32. {dstack-0.19.13.dist-info → dstack-0.19.14.dist-info}/RECORD +35 -33
  33. {dstack-0.19.13.dist-info → dstack-0.19.14.dist-info}/WHEEL +0 -0
  34. {dstack-0.19.13.dist-info → dstack-0.19.14.dist-info}/entry_points.txt +0 -0
  35. {dstack-0.19.13.dist-info → dstack-0.19.14.dist-info}/licenses/LICENSE.md +0 -0
@@ -52,9 +52,8 @@ class AttachCommand(APIBaseCommand):
         )
         self._parser.add_argument(
             "--replica",
-            help="The replica number. Defaults to 0.",
+            help="The replica number. Defaults to any running replica.",
             type=int,
-            default=0,
         )
         self._parser.add_argument(
             "--job",
@@ -129,14 +128,15 @@ _IGNORED_PORTS = [DSTACK_RUNNER_HTTP_PORT]
 def _print_attached_message(
     run: Run,
     bind_address: Optional[str],
-    replica_num: int,
+    replica_num: Optional[int],
     job_num: int,
 ):
     if bind_address is None:
         bind_address = "localhost"
 
-    output = f"Attached to run [code]{run.name}[/] (replica={replica_num} job={job_num})\n"
     job = get_or_error(run._find_job(replica_num=replica_num, job_num=job_num))
+    replica_num = job.job_spec.replica_num
+    output = f"Attached to run [code]{run.name}[/] (replica={replica_num} job={job_num})\n"
     name = run.name
     if replica_num != 0 or job_num != 0:
         name = job.job_spec.job_name
@@ -599,6 +599,7 @@ def _is_ready_to_attach(run: Run) -> bool:
         ]
         or run._run.jobs[0].job_submissions[-1].status
         in [JobStatus.SUBMITTED, JobStatus.PROVISIONING, JobStatus.PULLING]
+        or run._run.is_deployment_in_progress()
     )
 
 
@@ -162,9 +162,16 @@ def get_runs_table(
 
     for run in runs:
         run = run._run  # TODO(egor-s): make public attribute
+        show_deployment_num = (
+            verbose
+            and run.run_spec.configuration.type == "service"
+            or run.is_deployment_in_progress()
+        )
+        merge_job_rows = len(run.jobs) == 1 and not show_deployment_num
 
         run_row: Dict[Union[str, int], Any] = {
-            "NAME": run.run_spec.run_name,
+            "NAME": run.run_spec.run_name
+            + (f" [secondary]deployment={run.deployment_num}[/]" if show_deployment_num else ""),
             "SUBMITTED": format_date(run.submitted_at),
             "STATUS": (
                 run.latest_job_submission.status_message
@@ -174,7 +181,7 @@ def get_runs_table(
         }
         if run.error:
             run_row["ERROR"] = run.error
-        if len(run.jobs) != 1:
+        if not merge_job_rows:
             add_row_from_dict(table, run_row)
 
         for job in run.jobs:
@@ -184,7 +191,12 @@ def get_runs_table(
                 inactive_for = format_duration_multiunit(latest_job_submission.inactivity_secs)
                 status += f" (inactive for {inactive_for})"
             job_row: Dict[Union[str, int], Any] = {
-                "NAME": f"  replica={job.job_spec.replica_num} job={job.job_spec.job_num}",
+                "NAME": f"  replica={job.job_spec.replica_num} job={job.job_spec.job_num}"
+                + (
+                    f" deployment={latest_job_submission.deployment_num}"
+                    if show_deployment_num
+                    else ""
+                ),
                 "STATUS": latest_job_submission.status_message,
                 "SUBMITTED": format_date(latest_job_submission.submitted_at),
                 "ERROR": latest_job_submission.error,
@@ -208,7 +220,7 @@ def get_runs_table(
                     "PRICE": f"${jpd.price:.4f}".rstrip("0").rstrip("."),
                 }
             )
-            if len(run.jobs) == 1:
+            if merge_job_rows:
                # merge rows
                job_row.update(run_row)
            add_row_from_dict(table, job_row, style="secondary" if len(run.jobs) != 1 else None)
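Note: with the changes above, a service mid-rolling-deployment renders along these lines (hypothetical names and numbers; the [secondary]...[/] markup is color styling and is not printed literally):

    NAME                             STATUS   SUBMITTED
    my-service deployment=2          running  11:05
      replica=0 job=0 deployment=1   running  11:05
      replica=1 job=0 deployment=2   running  11:12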
@@ -19,6 +19,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
     if current_resource is not None:
         current_resource_excludes = {}
         current_resource_excludes["status_message"] = True
+        if current_resource.deployment_num == 0:
+            current_resource_excludes["deployment_num"] = True
         apply_plan_excludes["current_resource"] = current_resource_excludes
         current_resource_excludes["run_spec"] = get_run_spec_excludes(current_resource.run_spec)
         job_submissions_excludes = {}
@@ -36,6 +38,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
         }
         if all(js.exit_status is None for js in job_submissions):
             job_submissions_excludes["exit_status"] = True
+        if all(js.deployment_num == 0 for js in job_submissions):
+            job_submissions_excludes["deployment_num"] = True
         latest_job_submission = current_resource.latest_job_submission
         if latest_job_submission is not None:
             latest_job_submission_excludes = {}
@@ -50,6 +54,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
             }
             if latest_job_submission.exit_status is None:
                 latest_job_submission_excludes["exit_status"] = True
+            if latest_job_submission.deployment_num == 0:
+                latest_job_submission_excludes["deployment_num"] = True
     return {"plan": apply_plan_excludes}
 
 
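Note: the pattern above keeps new clients wire-compatible with pre-0.19.14 servers by omitting newly added fields from request payloads while they still hold their defaults. A minimal sketch of the idea (hypothetical model, plain pydantic v1, not dstack's actual CoreModel):

    from pydantic import BaseModel

    class Submission(BaseModel):
        submission_num: int
        deployment_num: int = 0  # field introduced in 0.19.14

    s = Submission(submission_num=3)
    # Exclude the new field while it still holds its default,
    # so an older server never sees an unknown key.
    exclude = {"deployment_num"} if s.deployment_num == 0 else None
    print(s.dict(exclude=exclude))  # {'submission_num': 3}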
@@ -25,3 +25,4 @@ class Project(CoreModel):
     created_at: Optional[datetime] = None
     backends: List[BackendInfo]
     members: List[Member]
+    is_public: bool = False
@@ -148,6 +148,19 @@ class JobTerminationReason(str, Enum):
         }
         return mapping[self]
 
+    def to_retry_event(self) -> Optional[RetryEvent]:
+        """
+        Returns:
+            the retry event this termination reason triggers
+            or None if this termination reason should not be retried
+        """
+        mapping = {
+            self.FAILED_TO_START_DUE_TO_NO_CAPACITY: RetryEvent.NO_CAPACITY,
+            self.INTERRUPTED_BY_NO_CAPACITY: RetryEvent.INTERRUPTION,
+        }
+        default = RetryEvent.ERROR if self.to_status() == JobStatus.FAILED else None
+        return mapping.get(self, default)
+
 
 class Requirements(CoreModel):
     # TODO: Make requirements' fields required
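Note: a quick sketch of how the new helper is consumed (illustrative values; the real call sites are in _should_retry_job in process_runs.py below):

    from dstack._internal.core.models.profiles import RetryEvent
    from dstack._internal.core.models.runs import JobTerminationReason

    # INTERRUPTED_BY_NO_CAPACITY maps to RetryEvent.INTERRUPTION explicitly;
    # other reasons that resolve to JobStatus.FAILED fall back to RetryEvent.ERROR.
    event = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY.to_retry_event()
    retry_on = [RetryEvent.INTERRUPTION, RetryEvent.NO_CAPACITY]
    should_retry = event is not None and event in retry_on  # True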
@@ -276,6 +289,7 @@ class ClusterInfo(CoreModel):
 class JobSubmission(CoreModel):
     id: UUID4
     submission_num: int
+    deployment_num: int = 0  # default for compatibility with pre-0.19.14 servers
     submitted_at: datetime
     last_processed_at: datetime
     finished_at: Optional[datetime]
@@ -354,7 +368,7 @@ class JobSubmission(CoreModel):
         error_mapping = {
             JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
             JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
-            JobTerminationReason.VOLUME_ERROR: "waiting runner limit exceeded",
+            JobTerminationReason.VOLUME_ERROR: "volume error",
             JobTerminationReason.GATEWAY_ERROR: "gateway error",
             JobTerminationReason.SCALED_DOWN: "scaled down",
             JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
@@ -503,6 +517,7 @@ class Run(CoreModel):
     latest_job_submission: Optional[JobSubmission]
     cost: float = 0
     service: Optional[ServiceSpec] = None
+    deployment_num: int = 0  # default for compatibility with pre-0.19.14 servers
     # TODO: make error a computed field after migrating to pydanticV2
     error: Optional[str] = None
     deleted: Optional[bool] = None
@@ -565,6 +580,13 @@ class Run(CoreModel):
             return "retrying"
         return status.value
 
+    def is_deployment_in_progress(self) -> bool:
+        return any(
+            not j.job_submissions[-1].status.is_finished()
+            and j.job_submissions[-1].deployment_num != self.deployment_num
+            for j in self.jobs
+        )
+
 
 class JobPlan(CoreModel):
     job_spec: JobSpec
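Note: in other words, a run is mid-deployment while any unfinished job still carries a deployment_num older than the run's own. Illustrative state (hypothetical values):

    # The run has been updated to deployment 2, but replica 0's latest
    # submission is still from deployment 1 and not yet finished:
    #   run.deployment_num == 2
    #   run.jobs[0].job_submissions[-1].deployment_num == 1  (status RUNNING)
    # -> run.is_deployment_in_progress() is True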
@@ -11,6 +11,7 @@ from fastapi import FastAPI, Request, Response, status
 from fastapi.datastructures import URL
 from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
 from fastapi.staticfiles import StaticFiles
+from prometheus_client import Counter, Histogram
 
 from dstack._internal.cli.utils.common import console
 from dstack._internal.core.errors import ForbiddenError, ServerClientError
@@ -63,6 +64,18 @@ from dstack._internal.utils.ssh import check_required_ssh_version
 
 logger = get_logger(__name__)
 
+# Server HTTP metrics
+REQUESTS_TOTAL = Counter(
+    "dstack_server_requests_total",
+    "Total number of HTTP requests",
+    ["method", "endpoint", "http_status", "project_name"],
+)
+REQUEST_DURATION = Histogram(
+    "dstack_server_request_duration_seconds",
+    "HTTP request duration in seconds",
+    ["method", "endpoint", "http_status", "project_name"],
+)
+
 
 def create_app() -> FastAPI:
     if settings.SENTRY_DSN is not None:
@@ -216,6 +229,8 @@ def register_routes(app: FastAPI, ui: bool = True):
         start_time = time.time()
         response: Response = await call_next(request)
         process_time = time.time() - start_time
+        # log process_time to be used in the log_http_metrics middleware
+        request.state.process_time = process_time
         logger.debug(
             "Processed request %s %s in %s. Status: %s",
             request.method,
@@ -225,6 +240,36 @@ def register_routes(app: FastAPI, ui: bool = True):
         )
         return response
 
+    # this middleware must be defined after the log_request middleware
+    @app.middleware("http")
+    async def log_http_metrics(request: Request, call_next):
+        def _extract_project_name(request: Request):
+            project_name = None
+            prefix = "/api/project/"
+            if request.url.path.startswith(prefix):
+                rest = request.url.path[len(prefix) :]
+                project_name = rest.split("/", 1)[0] if rest else None
+
+            return project_name
+
+        project_name = _extract_project_name(request)
+        response: Response = await call_next(request)
+
+        REQUEST_DURATION.labels(
+            method=request.method,
+            endpoint=request.url.path,
+            http_status=response.status_code,
+            project_name=project_name,
+        ).observe(request.state.process_time)
+
+        REQUESTS_TOTAL.labels(
+            method=request.method,
+            endpoint=request.url.path,
+            http_status=response.status_code,
+            project_name=project_name,
+        ).inc()
+        return response
+
     @app.middleware("http")
     async def check_client_version(request: Request, call_next):
         if (
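Note: a minimal sketch of what the new middleware records (assuming the default prometheus_client registry; the metrics endpoint itself lives in routers/prometheus.py, also touched in this release):

    from prometheus_client import generate_latest

    # After the server handles a request under /api/project/main/...,
    # the default registry contains samples along these lines:
    #   dstack_server_requests_total{endpoint="/api/project/main/runs/list",
    #       http_status="200",method="POST",project_name="main"} 1.0
    #   dstack_server_request_duration_seconds_bucket{...,le="0.1"} 1.0
    print(generate_latest().decode())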
@@ -1,4 +1,5 @@
 import asyncio
+import re
 from collections.abc import Iterable
 from datetime import timedelta, timezone
 from typing import Dict, List, Optional
@@ -7,6 +8,7 @@ from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload
 
+from dstack._internal import settings
 from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HTTP_PORT
 from dstack._internal.core.errors import GatewayError
 from dstack._internal.core.models.backends.base import BackendType
@@ -517,14 +519,14 @@ def _process_provisioning_with_shim(
     cpu = None
     memory = None
     network_mode = NetworkMode.HOST
-
+    image_name = _patch_base_image_for_aws_efa(job_spec, job_provisioning_data)
     if shim_client.is_api_v2_supported():
         shim_client.submit_task(
             task_id=job_model.id,
             name=job_model.job_name,
             registry_username=registry_username,
             registry_password=registry_password,
-            image_name=job_spec.image_name,
+            image_name=image_name,
             container_user=container_user,
             privileged=job_spec.privileged,
             gpu=gpu,
@@ -545,7 +547,7 @@ def _process_provisioning_with_shim(
         submitted = shim_client.submit(
             username=registry_username,
             password=registry_password,
-            image_name=job_spec.image_name,
+            image_name=image_name,
             privileged=job_spec.privileged,
             container_name=job_model.job_name,
             container_user=container_user,
@@ -969,3 +971,43 @@ def _get_instance_specific_gpu_devices(
         GPUDevice(path_on_host="/dev/nvidiactl", path_in_container="/dev/nvidiactl")
     )
     return gpu_devices
+
+
+def _patch_base_image_for_aws_efa(
+    job_spec: JobSpec, job_provisioning_data: JobProvisioningData
+) -> str:
+    image_name = job_spec.image_name
+
+    if job_provisioning_data.backend != BackendType.AWS:
+        return image_name
+
+    instance_type = job_provisioning_data.instance_type.name
+    efa_enabled_patterns = [
+        # TODO: p6-b200 isn't supported yet in gpuhunt
+        r"^p6-b200\.(48xlarge)$",
+        r"^p5\.(48xlarge)$",
+        r"^p5e\.(48xlarge)$",
+        r"^p5en\.(48xlarge)$",
+        r"^p4d\.(24xlarge)$",
+        r"^p4de\.(24xlarge)$",
+        r"^g6\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$",
+        r"^g6e\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$",
+        r"^gr6\.8xlarge$",
+        r"^g5\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$",
+        r"^g4dn\.(8xlarge|12xlarge|16xlarge|metal)$",
+        r"^p3dn\.(24xlarge)$",
+    ]
+
+    is_efa_enabled = any(re.match(pattern, instance_type) for pattern in efa_enabled_patterns)
+    if not is_efa_enabled:
+        return image_name
+
+    if not image_name.startswith(f"{settings.DSTACK_BASE_IMAGE}:"):
+        return image_name
+
+    if image_name.endswith(f"-base-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}"):
+        return image_name[:-17] + f"-devel-efa-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}"
+    elif image_name.endswith(f"-devel-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}"):
+        return image_name[:-18] + f"-devel-efa-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}"
+
+    return image_name
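Note: illustrative rewrites performed by _patch_base_image_for_aws_efa (assuming settings.DSTACK_BASE_IMAGE is "dstackai/base" and DSTACK_BASE_IMAGE_UBUNTU_VERSION is a 5-character version like "22.04"; the hardcoded [:-17] and [:-18] slices are exactly len("-base-ubuntu22.04") and len("-devel-ubuntu22.04")):

    # On an EFA-capable instance type such as p5.48xlarge:
    #   "dstackai/base:0.7-base-ubuntu22.04"  -> "dstackai/base:0.7-devel-efa-ubuntu22.04"
    #   "dstackai/base:0.7-devel-ubuntu22.04" -> "dstackai/base:0.7-devel-efa-ubuntu22.04"
    # Non-AWS backends, non-EFA instance types, and custom images pass through unchanged.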
@@ -1,18 +1,17 @@
 import asyncio
 import datetime
-import itertools
 from typing import List, Optional, Set, Tuple
 
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload, selectinload
 
-import dstack._internal.server.services.gateways as gateways
 import dstack._internal.server.services.services.autoscalers as autoscalers
 from dstack._internal.core.errors import ServerError
 from dstack._internal.core.models.profiles import RetryEvent, StopCriteria
 from dstack._internal.core.models.runs import (
     Job,
+    JobSpec,
     JobStatus,
     JobTerminationReason,
     Run,
@@ -24,22 +23,23 @@ from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import JobModel, ProjectModel, RunModel
 from dstack._internal.server.services.jobs import (
     find_job,
-    get_jobs_from_run_spec,
+    get_job_specs_from_run_spec,
     group_jobs_by_replica_latest,
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.runs import (
-    create_job_model_for_new_submission,
     fmt,
     process_terminating_run,
     retry_run_replica_jobs,
     run_model_to_run,
     scale_run_replicas,
 )
+from dstack._internal.server.services.services import update_service_desired_replica_count
 from dstack._internal.utils import common
 from dstack._internal.utils.logging import get_logger
 
 logger = get_logger(__name__)
+ROLLING_DEPLOYMENT_MAX_SURGE = 1  # at most one extra replica during rolling deployment
 
 
 async def process_runs(batch_size: int = 1):
@@ -133,46 +133,22 @@ async def _process_pending_run(session: AsyncSession, run_model: RunModel):
         logger.debug("%s: pending run is not yet ready for resubmission", fmt(run_model))
         return
 
-    # TODO(egor-s) consolidate with `scale_run_replicas` if possible
-    replicas = 1
+    run_model.desired_replica_count = 1
     if run.run_spec.configuration.type == "service":
-        replicas = run.run_spec.configuration.replicas.min or 0  # new default
-        scaler = autoscalers.get_service_scaler(run.run_spec.configuration)
-        stats = None
-        if run_model.gateway_id is not None:
-            conn = await gateways.get_or_add_gateway_connection(session, run_model.gateway_id)
-            stats = await conn.get_stats(run_model.project.name, run_model.run_name)
-        # replicas info doesn't matter for now
-        replicas = scaler.scale([], stats)
-    if replicas == 0:
+        run_model.desired_replica_count = run.run_spec.configuration.replicas.min or 0
+        await update_service_desired_replica_count(
+            session,
+            run_model,
+            run.run_spec.configuration,
+            # does not matter for pending services, since 0->n scaling should happen without delay
+            last_scaled_at=None,
+        )
+
+    if run_model.desired_replica_count == 0:
         # stay zero scaled
         return
 
-    scheduled_replicas = 0
-    # Resubmit existing replicas
-    for replica_num, replica_jobs in itertools.groupby(
-        run.jobs, key=lambda j: j.job_spec.replica_num
-    ):
-        if scheduled_replicas >= replicas:
-            break
-        scheduled_replicas += 1
-        for job in replica_jobs:
-            new_job_model = create_job_model_for_new_submission(
-                run_model=run_model,
-                job=job,
-                status=JobStatus.SUBMITTED,
-            )
-            session.add(new_job_model)
-    # Create missing replicas
-    for replica_num in range(scheduled_replicas, replicas):
-        jobs = await get_jobs_from_run_spec(run.run_spec, replica_num=replica_num)
-        for job in jobs:
-            job_model = create_job_model_for_new_submission(
-                run_model=run_model,
-                job=job,
-                status=JobStatus.SUBMITTED,
-            )
-            session.add(job_model)
+    await scale_run_replicas(session, run_model, replicas_diff=run_model.desired_replica_count)
 
     run_model.status = RunStatus.SUBMITTED
     logger.info("%s: run status has changed PENDING -> SUBMITTED", fmt(run_model))
@@ -340,27 +316,11 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
                 job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
 
     if new_status not in {RunStatus.TERMINATING, RunStatus.PENDING}:
-        # No need to retry if the run is terminating,
+        # No need to retry, scale, or redeploy replicas if the run is terminating,
         # pending run will retry replicas in `process_pending_run`
-        for _, replica_jobs in replicas_to_retry:
-            await retry_run_replica_jobs(
-                session, run_model, replica_jobs, only_failed=retry_single_job
-            )
-
-        if run_spec.configuration.type == "service":
-            scaler = autoscalers.get_service_scaler(run_spec.configuration)
-            stats = None
-            if run_model.gateway_id is not None:
-                conn = await gateways.get_or_add_gateway_connection(session, run_model.gateway_id)
-                stats = await conn.get_stats(run_model.project.name, run_model.run_name)
-            # use replicas_info from before retrying
-            replicas_diff = scaler.scale(replicas_info, stats)
-            if replicas_diff != 0:
-                # FIXME: potentially long write transaction
-                # Why do we flush here?
-                await session.flush()
-                await session.refresh(run_model)
-                await scale_run_replicas(session, run_model, replicas_diff)
+        await _handle_run_replicas(
+            session, run_model, run_spec, replicas_to_retry, retry_single_job, replicas_info
+        )
 
     if run_model.status != new_status:
         logger.info(
@@ -378,6 +338,130 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
     run_model.resubmission_attempt += 1
 
 
+async def _handle_run_replicas(
+    session: AsyncSession,
+    run_model: RunModel,
+    run_spec: RunSpec,
+    replicas_to_retry: list[tuple[int, list[JobModel]]],
+    retry_single_job: bool,
+    replicas_info: list[autoscalers.ReplicaInfo],
+) -> None:
+    """
+    Does ONE of:
+    - replica retry
+    - replica scaling
+    - replica rolling deployment
+
+    Does not do everything at once to avoid conflicts between the stages and long DB transactions.
+    """
+
+    if replicas_to_retry:
+        for _, replica_jobs in replicas_to_retry:
+            await retry_run_replica_jobs(
+                session, run_model, replica_jobs, only_failed=retry_single_job
+            )
+        return
+
+    if run_spec.configuration.type == "service":
+        await update_service_desired_replica_count(
+            session,
+            run_model,
+            run_spec.configuration,
+            # FIXME: should only include scaling events, not retries and deployments
+            last_scaled_at=max((r.timestamp for r in replicas_info), default=None),
+        )
+
+    max_replica_count = run_model.desired_replica_count
+    if _has_out_of_date_replicas(run_model):
+        # allow extra replicas when deployment is in progress
+        max_replica_count += ROLLING_DEPLOYMENT_MAX_SURGE
+
+    active_replica_count = sum(1 for r in replicas_info if r.active)
+    if active_replica_count not in range(run_model.desired_replica_count, max_replica_count + 1):
+        await scale_run_replicas(
+            session,
+            run_model,
+            replicas_diff=run_model.desired_replica_count - active_replica_count,
+        )
+        return
+
+    await _update_jobs_to_new_deployment_in_place(run_model, run_spec)
+    if _has_out_of_date_replicas(run_model):
+        non_terminated_replica_count = len(
+            {j.replica_num for j in run_model.jobs if not j.status.is_finished()}
+        )
+        # Avoid using too much hardware during a deployment - never have
+        # more than max_replica_count non-terminated replicas.
+        if non_terminated_replica_count < max_replica_count:
+            # Start more up-to-date replicas that will eventually replace out-of-date replicas.
+            await scale_run_replicas(
+                session,
+                run_model,
+                replicas_diff=max_replica_count - non_terminated_replica_count,
+            )
+
+        replicas_to_stop_count = 0
+        # stop any out-of-date replicas that are not running
+        replicas_to_stop_count += len(
+            {
+                j.replica_num
+                for j in run_model.jobs
+                if j.status
+                not in [JobStatus.RUNNING, JobStatus.TERMINATING] + JobStatus.finished_statuses()
+                and j.deployment_num < run_model.deployment_num
+            }
+        )
+        running_replica_count = len(
+            {j.replica_num for j in run_model.jobs if j.status == JobStatus.RUNNING}
+        )
+        if running_replica_count > run_model.desired_replica_count:
+            # stop excessive running out-of-date replicas
+            replicas_to_stop_count += running_replica_count - run_model.desired_replica_count
+        if replicas_to_stop_count:
+            await scale_run_replicas(
+                session,
+                run_model,
+                replicas_diff=-replicas_to_stop_count,
+            )
+
+
+async def _update_jobs_to_new_deployment_in_place(run_model: RunModel, run_spec: RunSpec) -> None:
+    """
+    Bump deployment_num for jobs that do not require redeployment.
+    """
+
+    for replica_num, job_models in group_jobs_by_replica_latest(run_model.jobs):
+        if all(j.status.is_finished() for j in job_models):
+            continue
+        if all(j.deployment_num == run_model.deployment_num for j in job_models):
+            continue
+        new_job_specs = await get_job_specs_from_run_spec(
+            run_spec=run_spec,
+            replica_num=replica_num,
+        )
+        assert len(new_job_specs) == len(job_models), (
+            "Changing the number of jobs within a replica is not yet supported"
+        )
+        can_update_all_jobs = True
+        for old_job_model, new_job_spec in zip(job_models, new_job_specs):
+            old_job_spec = JobSpec.__response__.parse_raw(old_job_model.job_spec_data)
+            if new_job_spec != old_job_spec:
+                can_update_all_jobs = False
+                break
+        if can_update_all_jobs:
+            for job_model in job_models:
+                job_model.deployment_num = run_model.deployment_num
+
+
+def _has_out_of_date_replicas(run: RunModel) -> bool:
+    for job in run.jobs:
+        if job.deployment_num < run.deployment_num and not (
+            job.status.is_finished() or job.termination_reason == JobTerminationReason.SCALED_DOWN
+        ):
+            return True
+    return False
+
+
 def _should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datetime.timedelta]:
     """
     Checks if the job should be retried.
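Note: a worked example of the surge arithmetic in _handle_run_replicas (illustrative numbers):

    # desired_replica_count = 2, new deployment in progress:
    desired, surge = 2, 1                     # ROLLING_DEPLOYMENT_MAX_SURGE = 1
    max_replicas = desired + surge            # 3 while out-of-date replicas exist
    non_terminated = 2                        # both replicas still on the old deployment
    to_start = max_replicas - non_terminated  # start 1 up-to-date replica
    # Once the new replica is RUNNING, running (3) > desired (2), so one
    # out-of-date replica is stopped; the cycle repeats until none remain.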
@@ -393,7 +477,8 @@ def _should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datetime.timedelta]:
             break
 
     if (
-        job_model.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
+        job_model.termination_reason is not None
+        and job_model.termination_reason.to_retry_event() == RetryEvent.NO_CAPACITY
         and last_provisioned_submission is None
        and RetryEvent.NO_CAPACITY in job.job_spec.retry.on_events
    ):
@@ -403,24 +488,9 @@ def _should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datetime.timedelta]:
         return None
 
     if (
-        last_provisioned_submission.termination_reason
-        == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
-        and RetryEvent.INTERRUPTION in job.job_spec.retry.on_events
-    ):
-        return common.get_current_datetime() - last_provisioned_submission.last_processed_at
-
-    if (
-        last_provisioned_submission.termination_reason
-        in [
-            JobTerminationReason.CONTAINER_EXITED_WITH_ERROR,
-            JobTerminationReason.CREATING_CONTAINER_ERROR,
-            JobTerminationReason.EXECUTOR_ERROR,
-            JobTerminationReason.GATEWAY_ERROR,
-            JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED,
-            JobTerminationReason.WAITING_RUNNER_LIMIT_EXCEEDED,
-            JobTerminationReason.PORTS_BINDING_FAILED,
-        ]
-        and RetryEvent.ERROR in job.job_spec.retry.on_events
+        last_provisioned_submission.termination_reason is not None
+        and last_provisioned_submission.termination_reason.to_retry_event()
+        in job.job_spec.retry.on_events
     ):
         return common.get_current_datetime() - last_provisioned_submission.last_processed_at
 
@@ -0,0 +1,42 @@
+"""Add rolling deployment fields
+
+Revision ID: 35e90e1b0d3e
+Revises: 35f732ee4cf5
+Create Date: 2025-05-29 15:30:27.878569
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "35e90e1b0d3e"
+down_revision = "35f732ee4cf5"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.execute("UPDATE jobs SET deployment_num = 0")
+        batch_op.alter_column("deployment_num", nullable=False)
+
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
+        batch_op.add_column(sa.Column("desired_replica_count", sa.Integer(), nullable=True))
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.execute("UPDATE runs SET deployment_num = 0")
+        batch_op.execute("UPDATE runs SET desired_replica_count = 1")
+        batch_op.alter_column("deployment_num", nullable=False)
+        batch_op.alter_column("desired_replica_count", nullable=False)
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.drop_column("deployment_num")
+        batch_op.drop_column("desired_replica_count")
+
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("deployment_num")
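Note: the upgrade uses the SQLite-friendly pattern of adding each column as nullable, backfilling it, then flipping it to NOT NULL inside a batch_alter_table block. The dstack server applies migrations on startup; to run them by hand, the generic Alembic invocation would be roughly (hypothetical alembic.ini path, not a dstack-specific CLI):

    from alembic import command
    from alembic.config import Config

    cfg = Config("alembic.ini")   # assumes a config pointing at the server DB
    command.upgrade(cfg, "head")  # applies 35f732ee4cf5, then 35e90e1b0d3e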