dstack 0.19.12rc1__py3-none-any.whl → 0.19.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (62)
  1. dstack/_internal/cli/commands/attach.py +4 -4
  2. dstack/_internal/cli/services/configurators/run.py +44 -47
  3. dstack/_internal/cli/utils/run.py +31 -31
  4. dstack/_internal/core/backends/aws/compute.py +22 -9
  5. dstack/_internal/core/backends/aws/resources.py +26 -0
  6. dstack/_internal/core/backends/base/offers.py +0 -1
  7. dstack/_internal/core/backends/template/configurator.py.jinja +1 -6
  8. dstack/_internal/core/backends/template/models.py.jinja +4 -0
  9. dstack/_internal/core/compatibility/__init__.py +0 -0
  10. dstack/_internal/core/compatibility/fleets.py +72 -0
  11. dstack/_internal/core/compatibility/gateways.py +34 -0
  12. dstack/_internal/core/compatibility/runs.py +131 -0
  13. dstack/_internal/core/compatibility/volumes.py +32 -0
  14. dstack/_internal/core/models/configurations.py +1 -1
  15. dstack/_internal/core/models/fleets.py +6 -1
  16. dstack/_internal/core/models/instances.py +51 -12
  17. dstack/_internal/core/models/profiles.py +43 -3
  18. dstack/_internal/core/models/projects.py +1 -0
  19. dstack/_internal/core/models/repos/local.py +3 -3
  20. dstack/_internal/core/models/runs.py +139 -43
  21. dstack/_internal/server/app.py +46 -1
  22. dstack/_internal/server/background/tasks/process_running_jobs.py +92 -15
  23. dstack/_internal/server/background/tasks/process_runs.py +163 -80
  24. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +42 -0
  25. dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py +39 -0
  26. dstack/_internal/server/models.py +4 -0
  27. dstack/_internal/server/routers/projects.py +4 -3
  28. dstack/_internal/server/routers/prometheus.py +4 -1
  29. dstack/_internal/server/schemas/projects.py +1 -0
  30. dstack/_internal/server/security/permissions.py +36 -0
  31. dstack/_internal/server/services/jobs/__init__.py +1 -0
  32. dstack/_internal/server/services/jobs/configurators/base.py +11 -7
  33. dstack/_internal/server/services/projects.py +54 -1
  34. dstack/_internal/server/services/runner/client.py +4 -1
  35. dstack/_internal/server/services/runs.py +49 -29
  36. dstack/_internal/server/services/services/__init__.py +19 -0
  37. dstack/_internal/server/services/services/autoscalers.py +37 -26
  38. dstack/_internal/server/services/storage/__init__.py +38 -0
  39. dstack/_internal/server/services/storage/base.py +27 -0
  40. dstack/_internal/server/services/storage/gcs.py +44 -0
  41. dstack/_internal/server/services/{storage.py → storage/s3.py} +4 -27
  42. dstack/_internal/server/settings.py +7 -3
  43. dstack/_internal/server/statics/index.html +1 -1
  44. dstack/_internal/server/statics/{main-5b9786c955b42bf93581.js → main-0ac1e1583684417ae4d1.js} +1695 -62
  45. dstack/_internal/server/statics/{main-5b9786c955b42bf93581.js.map → main-0ac1e1583684417ae4d1.js.map} +1 -1
  46. dstack/_internal/server/statics/{main-8f9c66f404e9c7e7e020.css → main-f39c418b05fe14772dd8.css} +1 -1
  47. dstack/_internal/server/testing/common.py +11 -1
  48. dstack/_internal/settings.py +3 -0
  49. dstack/_internal/utils/common.py +4 -0
  50. dstack/api/_public/runs.py +14 -5
  51. dstack/api/server/_fleets.py +9 -69
  52. dstack/api/server/_gateways.py +3 -14
  53. dstack/api/server/_projects.py +2 -2
  54. dstack/api/server/_runs.py +4 -116
  55. dstack/api/server/_volumes.py +3 -14
  56. dstack/plugins/builtin/rest_plugin/_plugin.py +24 -5
  57. dstack/version.py +2 -2
  58. {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/METADATA +1 -1
  59. {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/RECORD +62 -52
  60. {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/WHEEL +0 -0
  61. {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/entry_points.txt +0 -0
  62. {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/server/app.py
@@ -11,6 +11,7 @@ from fastapi import FastAPI, Request, Response, status
 from fastapi.datastructures import URL
 from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse
 from fastapi.staticfiles import StaticFiles
+from prometheus_client import Counter, Histogram

 from dstack._internal.cli.utils.common import console
 from dstack._internal.core.errors import ForbiddenError, ServerClientError
@@ -63,6 +64,18 @@ from dstack._internal.utils.ssh import check_required_ssh_version

 logger = get_logger(__name__)

+# Server HTTP metrics
+REQUESTS_TOTAL = Counter(
+    "dstack_server_requests_total",
+    "Total number of HTTP requests",
+    ["method", "endpoint", "http_status", "project_name"],
+)
+REQUEST_DURATION = Histogram(
+    "dstack_server_request_duration_seconds",
+    "HTTP request duration in seconds",
+    ["method", "endpoint", "http_status", "project_name"],
+)
+

 def create_app() -> FastAPI:
     if settings.SENTRY_DSN is not None:
@@ -128,7 +141,7 @@ async def lifespan(app: FastAPI):
         yes=UPDATE_DEFAULT_PROJECT,
         no=DO_NOT_UPDATE_DEFAULT_PROJECT,
     )
-    if settings.SERVER_BUCKET is not None:
+    if settings.SERVER_S3_BUCKET is not None or settings.SERVER_GCS_BUCKET is not None:
        init_default_storage()
    scheduler = start_background_tasks()
    dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
@@ -216,6 +229,8 @@ def register_routes(app: FastAPI, ui: bool = True):
         start_time = time.time()
         response: Response = await call_next(request)
         process_time = time.time() - start_time
+        # log process_time to be used in the log_http_metrics middleware
+        request.state.process_time = process_time
         logger.debug(
             "Processed request %s %s in %s. Status: %s",
             request.method,
@@ -225,6 +240,36 @@ def register_routes(app: FastAPI, ui: bool = True):
         )
         return response

+    # this middleware must be defined after the log_request middleware
+    @app.middleware("http")
+    async def log_http_metrics(request: Request, call_next):
+        def _extract_project_name(request: Request):
+            project_name = None
+            prefix = "/api/project/"
+            if request.url.path.startswith(prefix):
+                rest = request.url.path[len(prefix) :]
+                project_name = rest.split("/", 1)[0] if rest else None
+
+            return project_name
+
+        project_name = _extract_project_name(request)
+        response: Response = await call_next(request)
+
+        REQUEST_DURATION.labels(
+            method=request.method,
+            endpoint=request.url.path,
+            http_status=response.status_code,
+            project_name=project_name,
+        ).observe(request.state.process_time)
+
+        REQUESTS_TOTAL.labels(
+            method=request.method,
+            endpoint=request.url.path,
+            http_status=response.status_code,
+            project_name=project_name,
+        ).inc()
+        return response
+
     @app.middleware("http")
     async def check_client_version(request: Request, call_next):
         if (
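
Note on the middleware pair above: log_request measures process_time and stashes it on request.state, and log_http_metrics reads it after the response is produced, which is why they must be registered in the order shown. Below is a minimal, standalone sketch of the prometheus_client pattern used here; the metric names and label values are illustrative, not dstack's:

import time

from prometheus_client import Counter, Histogram, generate_latest

REQUESTS = Counter(
    "example_requests_total",
    "Total number of HTTP requests",
    ["method", "endpoint", "http_status"],
)
LATENCY = Histogram(
    "example_request_duration_seconds",
    "HTTP request duration in seconds",
    ["method", "endpoint", "http_status"],
)

start = time.time()
# ... handle the request here ...
elapsed = time.time() - start

# prometheus_client stringifies label values, so passing an int status is fine.
labels = {"method": "GET", "endpoint": "/api/project/main/runs", "http_status": 200}
LATENCY.labels(**labels).observe(elapsed)  # feeds the duration histogram buckets
REQUESTS.labels(**labels).inc()  # bumps the per-label request counter

print(generate_latest().decode())  # Prometheus text exposition format

The exposition side in dstack itself sits behind dstack/_internal/server/routers/prometheus.py, also touched in this release.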

dstack/_internal/server/background/tasks/process_running_jobs.py
@@ -1,4 +1,5 @@
 import asyncio
+import re
 from collections.abc import Iterable
 from datetime import timedelta, timezone
 from typing import Dict, List, Optional
@@ -7,6 +8,7 @@ from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload

+from dstack._internal import settings
 from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HTTP_PORT
 from dstack._internal.core.errors import GatewayError
 from dstack._internal.core.models.backends.base import BackendType
@@ -18,6 +20,7 @@ from dstack._internal.core.models.instances import (
     SSHConnectionParams,
 )
 from dstack._internal.core.models.metrics import Metric
+from dstack._internal.core.models.profiles import StartupOrder
 from dstack._internal.core.models.repos import RemoteRepoCreds
 from dstack._internal.core.models.runs import (
     ClusterInfo,
@@ -184,18 +187,10 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
     if job_provisioning_data.hostname is None:
         await _wait_for_instance_provisioning_data(job_model=job_model)
     else:
-        # Wait until all other jobs in the replica have IPs assigned.
-        # This is needed to ensure cluster_info has all IPs set.
-        for other_job in run.jobs:
-            if (
-                other_job.job_spec.replica_num == job.job_spec.replica_num
-                and other_job.job_submissions[-1].status == JobStatus.PROVISIONING
-                and other_job.job_submissions[-1].job_provisioning_data is not None
-                and other_job.job_submissions[-1].job_provisioning_data.hostname is None
-            ):
-                job_model.last_processed_at = common_utils.get_current_datetime()
-                await session.commit()
-                return
+        if _should_wait_for_other_nodes(run, job, job_model):
+            job_model.last_processed_at = common_utils.get_current_datetime()
+            await session.commit()
+            return

     # fails are acceptable until timeout is exceeded
     if job_provisioning_data.dockerized:
@@ -406,6 +401,48 @@ async def _wait_for_instance_provisioning_data(job_model: JobModel):
     job_model.job_provisioning_data = job_model.instance.job_provisioning_data


+def _should_wait_for_other_nodes(run: Run, job: Job, job_model: JobModel) -> bool:
+    for other_job in run.jobs:
+        if (
+            other_job.job_spec.replica_num == job.job_spec.replica_num
+            and other_job.job_submissions[-1].status == JobStatus.PROVISIONING
+            and other_job.job_submissions[-1].job_provisioning_data is not None
+            and other_job.job_submissions[-1].job_provisioning_data.hostname is None
+        ):
+            logger.debug(
+                "%s: waiting for other job to have IP assigned",
+                fmt(job_model),
+            )
+            return True
+    master_job = find_job(run.jobs, job.job_spec.replica_num, 0)
+    if (
+        job.job_spec.job_num != 0
+        and run.run_spec.merged_profile.startup_order == StartupOrder.MASTER_FIRST
+        and master_job.job_submissions[-1].status != JobStatus.RUNNING
+    ):
+        logger.debug(
+            "%s: waiting for master job to become running",
+            fmt(job_model),
+        )
+        return True
+    if (
+        job.job_spec.job_num == 0
+        and run.run_spec.merged_profile.startup_order == StartupOrder.WORKERS_FIRST
+    ):
+        for other_job in run.jobs:
+            if (
+                other_job.job_spec.replica_num == job.job_spec.replica_num
+                and other_job.job_spec.job_num != job.job_spec.job_num
+                and other_job.job_submissions[-1].status != JobStatus.RUNNING
+            ):
+                logger.debug(
+                    "%s: waiting for worker job to become running",
+                    fmt(job_model),
+                )
+                return True
+    return False
+
+
 @runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
 def _process_provisioning_with_shim(
     ports: Dict[int, int],
@@ -482,14 +519,14 @@ def _process_provisioning_with_shim(
     cpu = None
     memory = None
     network_mode = NetworkMode.HOST
-
+    image_name = _patch_base_image_for_aws_efa(job_spec, job_provisioning_data)
     if shim_client.is_api_v2_supported():
         shim_client.submit_task(
             task_id=job_model.id,
             name=job_model.job_name,
             registry_username=registry_username,
             registry_password=registry_password,
-            image_name=job_spec.image_name,
+            image_name=image_name,
             container_user=container_user,
             privileged=job_spec.privileged,
             gpu=gpu,
@@ -510,7 +547,7 @@ def _process_provisioning_with_shim(
         submitted = shim_client.submit(
             username=registry_username,
             password=registry_password,
-            image_name=job_spec.image_name,
+            image_name=image_name,
             privileged=job_spec.privileged,
             container_name=job_model.job_name,
             container_user=container_user,
@@ -934,3 +971,43 @@ def _get_instance_specific_gpu_devices(
         GPUDevice(path_on_host="/dev/nvidiactl", path_in_container="/dev/nvidiactl")
     )
     return gpu_devices
+
+
+def _patch_base_image_for_aws_efa(
+    job_spec: JobSpec, job_provisioning_data: JobProvisioningData
+) -> str:
+    image_name = job_spec.image_name
+
+    if job_provisioning_data.backend != BackendType.AWS:
+        return image_name
+
+    instance_type = job_provisioning_data.instance_type.name
+    efa_enabled_patterns = [
+        # TODO: p6-b200 isn't supported yet in gpuhunt
+        r"^p6-b200\.(48xlarge)$",
+        r"^p5\.(48xlarge)$",
+        r"^p5e\.(48xlarge)$",
+        r"^p5en\.(48xlarge)$",
+        r"^p4d\.(24xlarge)$",
+        r"^p4de\.(24xlarge)$",
+        r"^g6\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$",
+        r"^g6e\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$",
+        r"^gr6\.8xlarge$",
+        r"^g5\.(8xlarge|12xlarge|16xlarge|24xlarge|48xlarge)$",
+        r"^g4dn\.(8xlarge|12xlarge|16xlarge|metal)$",
+        r"^p3dn\.(24xlarge)$",
+    ]
+
+    is_efa_enabled = any(re.match(pattern, instance_type) for pattern in efa_enabled_patterns)
+    if not is_efa_enabled:
+        return image_name
+
+    if not image_name.startswith(f"{settings.DSTACK_BASE_IMAGE}:"):
+        return image_name
+
+    if image_name.endswith(f"-base-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}"):
+        return image_name[:-17] + f"-devel-efa-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}"
+    elif image_name.endswith(f"-devel-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}"):
+        return image_name[:-18] + f"-devel-efa-ubuntu{settings.DSTACK_BASE_IMAGE_UBUNTU_VERSION}"
+
+    return image_name
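
_patch_base_image_for_aws_efa above swaps the image-tag suffix so that jobs landing on EFA-capable AWS instance types run the -devel-efa flavor of the dstack base image. The hard-coded slices match the suffix lengths for a five-character Ubuntu version such as 22.04: len("-base-ubuntu22.04") == 17 and len("-devel-ubuntu22.04") == 18. A standalone sketch of the same rewrite, with hypothetical values standing in for the real settings:

# Hypothetical stand-ins for illustration; the real values come from
# dstack._internal.settings.
BASE_IMAGE = "dstackai/base"
UBUNTU_VERSION = "22.04"


def patch_for_efa(image_name: str) -> str:
    # Only dstack base images are rewritten; user-supplied images pass through.
    if not image_name.startswith(f"{BASE_IMAGE}:"):
        return image_name
    for suffix in (f"-base-ubuntu{UBUNTU_VERSION}", f"-devel-ubuntu{UBUNTU_VERSION}"):
        if image_name.endswith(suffix):
            # Replace the flavor suffix with the EFA-enabled devel variant.
            return image_name[: -len(suffix)] + f"-devel-efa-ubuntu{UBUNTU_VERSION}"
    return image_name


# e.g. "dstackai/base:py3.12-0.7-base-ubuntu22.04"
#  ->  "dstackai/base:py3.12-0.7-devel-efa-ubuntu22.04"
print(patch_for_efa("dstackai/base:py3.12-0.7-base-ubuntu22.04"))

Computing -len(suffix), as in the sketch, would keep the slice correct if the Ubuntu version string ever changed length, whereas the -17/-18 constants assume it stays five characters.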

dstack/_internal/server/background/tasks/process_runs.py
@@ -1,18 +1,17 @@
 import asyncio
 import datetime
-import itertools
 from typing import List, Optional, Set, Tuple

 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload, selectinload

-import dstack._internal.server.services.gateways as gateways
 import dstack._internal.server.services.services.autoscalers as autoscalers
 from dstack._internal.core.errors import ServerError
-from dstack._internal.core.models.profiles import RetryEvent
+from dstack._internal.core.models.profiles import RetryEvent, StopCriteria
 from dstack._internal.core.models.runs import (
     Job,
+    JobSpec,
     JobStatus,
     JobTerminationReason,
     Run,
@@ -24,22 +23,23 @@ from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import JobModel, ProjectModel, RunModel
 from dstack._internal.server.services.jobs import (
     find_job,
-    get_jobs_from_run_spec,
+    get_job_specs_from_run_spec,
     group_jobs_by_replica_latest,
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.runs import (
-    create_job_model_for_new_submission,
     fmt,
     process_terminating_run,
     retry_run_replica_jobs,
     run_model_to_run,
     scale_run_replicas,
 )
+from dstack._internal.server.services.services import update_service_desired_replica_count
 from dstack._internal.utils import common
 from dstack._internal.utils.logging import get_logger

 logger = get_logger(__name__)
+ROLLING_DEPLOYMENT_MAX_SURGE = 1  # at most one extra replica during rolling deployment


 async def process_runs(batch_size: int = 1):
@@ -133,46 +133,22 @@ async def _process_pending_run(session: AsyncSession, run_model: RunModel):
         logger.debug("%s: pending run is not yet ready for resubmission", fmt(run_model))
         return

-    # TODO(egor-s) consolidate with `scale_run_replicas` if possible
-    replicas = 1
+    run_model.desired_replica_count = 1
     if run.run_spec.configuration.type == "service":
-        replicas = run.run_spec.configuration.replicas.min or 0  # new default
-        scaler = autoscalers.get_service_scaler(run.run_spec.configuration)
-        stats = None
-        if run_model.gateway_id is not None:
-            conn = await gateways.get_or_add_gateway_connection(session, run_model.gateway_id)
-            stats = await conn.get_stats(run_model.project.name, run_model.run_name)
-        # replicas info doesn't matter for now
-        replicas = scaler.scale([], stats)
-    if replicas == 0:
+        run_model.desired_replica_count = run.run_spec.configuration.replicas.min or 0
+        await update_service_desired_replica_count(
+            session,
+            run_model,
+            run.run_spec.configuration,
+            # does not matter for pending services, since 0->n scaling should happen without delay
+            last_scaled_at=None,
+        )
+
+    if run_model.desired_replica_count == 0:
         # stay zero scaled
         return

-    scheduled_replicas = 0
-    # Resubmit existing replicas
-    for replica_num, replica_jobs in itertools.groupby(
-        run.jobs, key=lambda j: j.job_spec.replica_num
-    ):
-        if scheduled_replicas >= replicas:
-            break
-        scheduled_replicas += 1
-        for job in replica_jobs:
-            new_job_model = create_job_model_for_new_submission(
-                run_model=run_model,
-                job=job,
-                status=JobStatus.SUBMITTED,
-            )
-            session.add(new_job_model)
-    # Create missing replicas
-    for replica_num in range(scheduled_replicas, replicas):
-        jobs = await get_jobs_from_run_spec(run.run_spec, replica_num=replica_num)
-        for job in jobs:
-            job_model = create_job_model_for_new_submission(
-                run_model=run_model,
-                job=job,
-                status=JobStatus.SUBMITTED,
-            )
-            session.add(job_model)
+    await scale_run_replicas(session, run_model, replicas_diff=run_model.desired_replica_count)

     run_model.status = RunStatus.SUBMITTED
     logger.info("%s: run status has changed PENDING -> SUBMITTED", fmt(run_model))
@@ -313,6 +289,10 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
             termination_reason = RunTerminationReason.RETRY_LIMIT_EXCEEDED
         else:
             raise ValueError(f"Unexpected termination reason {run_termination_reasons}")
+    elif _should_stop_on_master_done(run):
+        new_status = RunStatus.TERMINATING
+        # ALL_JOBS_DONE is used for all DONE reasons including master-done
+        termination_reason = RunTerminationReason.ALL_JOBS_DONE
     elif RunStatus.RUNNING in run_statuses:
         new_status = RunStatus.RUNNING
     elif RunStatus.PROVISIONING in run_statuses:
@@ -336,27 +316,11 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
                 job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER

     if new_status not in {RunStatus.TERMINATING, RunStatus.PENDING}:
-        # No need to retry if the run is terminating,
+        # No need to retry, scale, or redeploy replicas if the run is terminating,
         # pending run will retry replicas in `process_pending_run`
-        for _, replica_jobs in replicas_to_retry:
-            await retry_run_replica_jobs(
-                session, run_model, replica_jobs, only_failed=retry_single_job
-            )
-
-        if run_spec.configuration.type == "service":
-            scaler = autoscalers.get_service_scaler(run_spec.configuration)
-            stats = None
-            if run_model.gateway_id is not None:
-                conn = await gateways.get_or_add_gateway_connection(session, run_model.gateway_id)
-                stats = await conn.get_stats(run_model.project.name, run_model.run_name)
-            # use replicas_info from before retrying
-            replicas_diff = scaler.scale(replicas_info, stats)
-            if replicas_diff != 0:
-                # FIXME: potentially long write transaction
-                # Why do we flush here?
-                await session.flush()
-                await session.refresh(run_model)
-                await scale_run_replicas(session, run_model, replicas_diff)
+        await _handle_run_replicas(
+            session, run_model, run_spec, replicas_to_retry, retry_single_job, replicas_info
+        )

     if run_model.status != new_status:
         logger.info(
@@ -374,6 +338,130 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
         run_model.resubmission_attempt += 1


+async def _handle_run_replicas(
+    session: AsyncSession,
+    run_model: RunModel,
+    run_spec: RunSpec,
+    replicas_to_retry: list[tuple[int, list[JobModel]]],
+    retry_single_job: bool,
+    replicas_info: list[autoscalers.ReplicaInfo],
+) -> None:
+    """
+    Does ONE of:
+    - replica retry
+    - replica scaling
+    - replica rolling deployment
+
+    Does not do everything at once to avoid conflicts between the stages and long DB transactions.
+    """
+
+    if replicas_to_retry:
+        for _, replica_jobs in replicas_to_retry:
+            await retry_run_replica_jobs(
+                session, run_model, replica_jobs, only_failed=retry_single_job
+            )
+        return
+
+    if run_spec.configuration.type == "service":
+        await update_service_desired_replica_count(
+            session,
+            run_model,
+            run_spec.configuration,
+            # FIXME: should only include scaling events, not retries and deployments
+            last_scaled_at=max((r.timestamp for r in replicas_info), default=None),
+        )
+
+    max_replica_count = run_model.desired_replica_count
+    if _has_out_of_date_replicas(run_model):
+        # allow extra replicas when deployment is in progress
+        max_replica_count += ROLLING_DEPLOYMENT_MAX_SURGE
+
+    active_replica_count = sum(1 for r in replicas_info if r.active)
+    if active_replica_count not in range(run_model.desired_replica_count, max_replica_count + 1):
+        await scale_run_replicas(
+            session,
+            run_model,
+            replicas_diff=run_model.desired_replica_count - active_replica_count,
+        )
+        return
+
+    await _update_jobs_to_new_deployment_in_place(run_model, run_spec)
+    if _has_out_of_date_replicas(run_model):
+        non_terminated_replica_count = len(
+            {j.replica_num for j in run_model.jobs if not j.status.is_finished()}
+        )
+        # Avoid using too much hardware during a deployment - never have
+        # more than max_replica_count non-terminated replicas.
+        if non_terminated_replica_count < max_replica_count:
+            # Start more up-to-date replicas that will eventually replace out-of-date replicas.
+            await scale_run_replicas(
+                session,
+                run_model,
+                replicas_diff=max_replica_count - non_terminated_replica_count,
+            )

+        replicas_to_stop_count = 0
+        # stop any out-of-date replicas that are not running
+        replicas_to_stop_count += len(
+            {
+                j.replica_num
+                for j in run_model.jobs
+                if j.status
+                not in [JobStatus.RUNNING, JobStatus.TERMINATING] + JobStatus.finished_statuses()
+                and j.deployment_num < run_model.deployment_num
+            }
+        )
+        running_replica_count = len(
+            {j.replica_num for j in run_model.jobs if j.status == JobStatus.RUNNING}
+        )
+        if running_replica_count > run_model.desired_replica_count:
+            # stop excessive running out-of-date replicas
+            replicas_to_stop_count += running_replica_count - run_model.desired_replica_count
+        if replicas_to_stop_count:
+            await scale_run_replicas(
+                session,
+                run_model,
+                replicas_diff=-replicas_to_stop_count,
+            )
+
+
+async def _update_jobs_to_new_deployment_in_place(run_model: RunModel, run_spec: RunSpec) -> None:
+    """
+    Bump deployment_num for jobs that do not require redeployment.
+    """
+
+    for replica_num, job_models in group_jobs_by_replica_latest(run_model.jobs):
+        if all(j.status.is_finished() for j in job_models):
+            continue
+        if all(j.deployment_num == run_model.deployment_num for j in job_models):
+            continue
+        new_job_specs = await get_job_specs_from_run_spec(
+            run_spec=run_spec,
+            replica_num=replica_num,
+        )
+        assert len(new_job_specs) == len(job_models), (
+            "Changing the number of jobs within a replica is not yet supported"
+        )
+        can_update_all_jobs = True
+        for old_job_model, new_job_spec in zip(job_models, new_job_specs):
+            old_job_spec = JobSpec.__response__.parse_raw(old_job_model.job_spec_data)
+            if new_job_spec != old_job_spec:
+                can_update_all_jobs = False
+                break
+        if can_update_all_jobs:
+            for job_model in job_models:
+                job_model.deployment_num = run_model.deployment_num
+
+
+def _has_out_of_date_replicas(run: RunModel) -> bool:
+    for job in run.jobs:
+        if job.deployment_num < run.deployment_num and not (
+            job.status.is_finished() or job.termination_reason == JobTerminationReason.SCALED_DOWN
+        ):
+            return True
+    return False
+
+
 def _should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datetime.timedelta]:
     """
     Checks if the job should be retried.
@@ -389,7 +477,8 @@ def _should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datet
             break

     if (
-        job_model.termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
+        job_model.termination_reason is not None
+        and job_model.termination_reason.to_retry_event() == RetryEvent.NO_CAPACITY
         and last_provisioned_submission is None
         and RetryEvent.NO_CAPACITY in job.job_spec.retry.on_events
     ):
@@ -399,24 +488,9 @@ def _should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datet
         return None

     if (
-        last_provisioned_submission.termination_reason
-        == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
-        and RetryEvent.INTERRUPTION in job.job_spec.retry.on_events
-    ):
-        return common.get_current_datetime() - last_provisioned_submission.last_processed_at
-
-    if (
-        last_provisioned_submission.termination_reason
-        in [
-            JobTerminationReason.CONTAINER_EXITED_WITH_ERROR,
-            JobTerminationReason.CREATING_CONTAINER_ERROR,
-            JobTerminationReason.EXECUTOR_ERROR,
-            JobTerminationReason.GATEWAY_ERROR,
-            JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED,
-            JobTerminationReason.WAITING_RUNNER_LIMIT_EXCEEDED,
-            JobTerminationReason.PORTS_BINDING_FAILED,
-        ]
-        and RetryEvent.ERROR in job.job_spec.retry.on_events
+        last_provisioned_submission.termination_reason is not None
+        and last_provisioned_submission.termination_reason.to_retry_event()
+        in job.job_spec.retry.on_events
     ):
         return common.get_current_datetime() - last_provisioned_submission.last_processed_at

@@ -434,3 +508,12 @@ def _can_retry_single_job(run_spec: RunSpec) -> bool:
     # We could make partial retry in some multi-node cases.
     # E.g. restarting a worker node, independent jobs.
     return False
+
+
+def _should_stop_on_master_done(run: Run) -> bool:
+    if run.run_spec.merged_profile.stop_criteria != StopCriteria.MASTER_DONE:
+        return False
+    for job in run.jobs:
+        if job.job_spec.job_num == 0 and job.job_submissions[-1].status == JobStatus.DONE:
+            return True
+    return False
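
The rolling-deployment bookkeeping in _handle_run_replicas reduces to small integer arithmetic: while out-of-date replicas exist, the run may briefly exceed desired_replica_count by ROLLING_DEPLOYMENT_MAX_SURGE, and excess or stale replicas are then stopped. A worked sketch of that arithmetic with illustrative counts:

ROLLING_DEPLOYMENT_MAX_SURGE = 1  # same constant as above

desired_replica_count = 3
deployment_in_progress = True  # i.e. _has_out_of_date_replicas(...) returned True

max_replica_count = desired_replica_count
if deployment_in_progress:
    # Allow one extra replica so a new one can start before an old one stops.
    max_replica_count += ROLLING_DEPLOYMENT_MAX_SURGE  # -> 4

# Case 1: a replica failed, so 2 active replicas fall outside the allowed
# band [3, 4] and the run scales by the difference.
active_replica_count = 2
if active_replica_count not in range(desired_replica_count, max_replica_count + 1):
    print("scale by", desired_replica_count - active_replica_count)  # scale by 1

# Case 2: mid-deployment, 3 up-to-date plus 1 out-of-date replicas are RUNNING;
# 4 running > 3 desired, so one out-of-date replica is stopped.
running_replica_count = 4
replicas_to_stop_count = max(0, running_replica_count - desired_replica_count)
print("stop", replicas_to_stop_count)  # stop 1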

dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py (new file)
@@ -0,0 +1,42 @@
+"""Add rolling deployment fields
+
+Revision ID: 35e90e1b0d3e
+Revises: 35f732ee4cf5
+Create Date: 2025-05-29 15:30:27.878569
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "35e90e1b0d3e"
+down_revision = "35f732ee4cf5"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.execute("UPDATE jobs SET deployment_num = 0")
+        batch_op.alter_column("deployment_num", nullable=False)
+
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("deployment_num", sa.Integer(), nullable=True))
+        batch_op.add_column(sa.Column("desired_replica_count", sa.Integer(), nullable=True))
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.execute("UPDATE runs SET deployment_num = 0")
+        batch_op.execute("UPDATE runs SET desired_replica_count = 1")
+        batch_op.alter_column("deployment_num", nullable=False)
+        batch_op.alter_column("desired_replica_count", nullable=False)
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.drop_column("deployment_num")
+        batch_op.drop_column("desired_replica_count")
+
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_column("deployment_num")

dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py (new file)
@@ -0,0 +1,39 @@
+"""Add ProjectModel.is_public
+
+Revision ID: 35f732ee4cf5
+Revises: bca2fdf130bf
+Create Date: 2025-06-06 13:04:02.912032
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "35f732ee4cf5"
+down_revision = "bca2fdf130bf"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    # Add is_public column as nullable first
+    with op.batch_alter_table("projects", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("is_public", sa.Boolean(), nullable=True))
+
+    # Set is_public to False for existing projects
+    op.execute(sa.sql.text("UPDATE projects SET is_public = FALSE"))
+
+    # Make is_public non-nullable with default value
+    with op.batch_alter_table("projects", schema=None) as batch_op:
+        batch_op.alter_column("is_public", nullable=False, server_default=sa.false())
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    # Remove is_public column
+    with op.batch_alter_table("projects", schema=None) as batch_op:
+        batch_op.drop_column("is_public")
+    # ### end Alembic commands ###
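
Both migrations follow the same three-step shape: add the new column as nullable, backfill existing rows, then tighten the column to NOT NULL. The batch_alter_table context manager makes this portable to SQLite, which cannot alter columns in place and instead recreates the table. A generic sketch of the pattern (the table and column names below are made up, not dstack's):

import sqlalchemy as sa
from alembic import op


def upgrade() -> None:
    # 1) Add the column as nullable so existing rows stay valid.
    with op.batch_alter_table("examples", schema=None) as batch_op:
        batch_op.add_column(sa.Column("flag", sa.Boolean(), nullable=True))

    # 2) Backfill every existing row.
    op.execute(sa.sql.text("UPDATE examples SET flag = FALSE"))

    # 3) Tighten the constraint once every row has a value.
    with op.batch_alter_table("examples", schema=None) as batch_op:
        batch_op.alter_column("flag", nullable=False, server_default=sa.false())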

dstack/_internal/server/models.py
@@ -202,6 +202,7 @@ class ProjectModel(BaseModel):
     name: Mapped[str] = mapped_column(String(50), unique=True)
     created_at: Mapped[datetime] = mapped_column(NaiveDateTime, default=get_current_datetime)
     deleted: Mapped[bool] = mapped_column(Boolean, default=False)
+    is_public: Mapped[bool] = mapped_column(Boolean, default=False)

     owner_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("users.id", ondelete="CASCADE"))
     owner: Mapped[UserModel] = relationship(lazy="joined")
@@ -349,6 +350,8 @@ class RunModel(BaseModel):
     run_spec: Mapped[str] = mapped_column(Text)
     service_spec: Mapped[Optional[str]] = mapped_column(Text)
     priority: Mapped[int] = mapped_column(Integer, default=0)
+    deployment_num: Mapped[int] = mapped_column(Integer)
+    desired_replica_count: Mapped[int] = mapped_column(Integer)

     jobs: Mapped[List["JobModel"]] = relationship(
         back_populates="run", lazy="selectin", order_by="[JobModel.replica_num, JobModel.job_num]"
@@ -403,6 +406,7 @@ class JobModel(BaseModel):
     instance: Mapped[Optional["InstanceModel"]] = relationship(back_populates="jobs")
     used_instance_id: Mapped[Optional[uuid.UUID]] = mapped_column(UUIDType(binary=False))
     replica_num: Mapped[int] = mapped_column(Integer)
+    deployment_num: Mapped[int] = mapped_column(Integer)
     job_runtime_data: Mapped[Optional[str]] = mapped_column(Text)