dstack-0.19.20-py3-none-any.whl → dstack-0.19.22-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (93)
  1. dstack/_internal/cli/commands/apply.py +8 -3
  2. dstack/_internal/cli/services/configurators/__init__.py +8 -0
  3. dstack/_internal/cli/services/configurators/fleet.py +1 -1
  4. dstack/_internal/cli/services/configurators/gateway.py +1 -1
  5. dstack/_internal/cli/services/configurators/run.py +11 -1
  6. dstack/_internal/cli/services/configurators/volume.py +1 -1
  7. dstack/_internal/cli/utils/common.py +48 -5
  8. dstack/_internal/cli/utils/fleet.py +5 -5
  9. dstack/_internal/cli/utils/run.py +32 -0
  10. dstack/_internal/core/backends/__init__.py +0 -65
  11. dstack/_internal/core/backends/configurators.py +9 -0
  12. dstack/_internal/core/backends/features.py +64 -0
  13. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  14. dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
  15. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  16. dstack/_internal/core/backends/hotaisle/compute.py +225 -0
  17. dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
  18. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  19. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  20. dstack/_internal/core/backends/models.py +8 -0
  21. dstack/_internal/core/compatibility/fleets.py +2 -0
  22. dstack/_internal/core/compatibility/runs.py +12 -0
  23. dstack/_internal/core/models/backends/base.py +2 -0
  24. dstack/_internal/core/models/configurations.py +139 -1
  25. dstack/_internal/core/models/health.py +28 -0
  26. dstack/_internal/core/models/instances.py +2 -0
  27. dstack/_internal/core/models/logs.py +2 -1
  28. dstack/_internal/core/models/profiles.py +37 -0
  29. dstack/_internal/core/models/runs.py +21 -1
  30. dstack/_internal/core/services/ssh/tunnel.py +7 -0
  31. dstack/_internal/server/app.py +26 -10
  32. dstack/_internal/server/background/__init__.py +9 -6
  33. dstack/_internal/server/background/tasks/process_fleets.py +52 -38
  34. dstack/_internal/server/background/tasks/process_gateways.py +2 -2
  35. dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
  36. dstack/_internal/server/background/tasks/process_instances.py +168 -103
  37. dstack/_internal/server/background/tasks/process_metrics.py +9 -2
  38. dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
  39. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  40. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
  41. dstack/_internal/server/background/tasks/process_running_jobs.py +142 -124
  42. dstack/_internal/server/background/tasks/process_runs.py +84 -34
  43. dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
  44. dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
  45. dstack/_internal/server/background/tasks/process_volumes.py +4 -1
  46. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  47. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  48. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  49. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  50. dstack/_internal/server/models.py +57 -16
  51. dstack/_internal/server/routers/instances.py +33 -5
  52. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  53. dstack/_internal/server/schemas/instances.py +32 -0
  54. dstack/_internal/server/schemas/runner.py +5 -0
  55. dstack/_internal/server/services/fleets.py +19 -10
  56. dstack/_internal/server/services/gateways/__init__.py +17 -17
  57. dstack/_internal/server/services/instances.py +113 -15
  58. dstack/_internal/server/services/jobs/__init__.py +18 -13
  59. dstack/_internal/server/services/jobs/configurators/base.py +26 -0
  60. dstack/_internal/server/services/logging.py +4 -2
  61. dstack/_internal/server/services/logs/aws.py +13 -1
  62. dstack/_internal/server/services/logs/gcp.py +16 -1
  63. dstack/_internal/server/services/offers.py +3 -3
  64. dstack/_internal/server/services/probes.py +6 -0
  65. dstack/_internal/server/services/projects.py +51 -19
  66. dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
  67. dstack/_internal/server/services/prometheus/custom_metrics.py +2 -3
  68. dstack/_internal/server/services/runner/client.py +52 -20
  69. dstack/_internal/server/services/runner/ssh.py +4 -4
  70. dstack/_internal/server/services/runs.py +115 -39
  71. dstack/_internal/server/services/services/__init__.py +4 -1
  72. dstack/_internal/server/services/ssh.py +66 -0
  73. dstack/_internal/server/services/users.py +2 -3
  74. dstack/_internal/server/services/volumes.py +11 -11
  75. dstack/_internal/server/settings.py +16 -0
  76. dstack/_internal/server/statics/index.html +1 -1
  77. dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
  78. dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
  79. dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
  80. dstack/_internal/server/testing/common.py +51 -0
  81. dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
  82. dstack/_internal/server/utils/sentry_utils.py +12 -0
  83. dstack/_internal/settings.py +3 -0
  84. dstack/_internal/utils/common.py +15 -0
  85. dstack/_internal/utils/cron.py +5 -0
  86. dstack/api/server/__init__.py +1 -1
  87. dstack/version.py +1 -1
  88. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/METADATA +13 -22
  89. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/RECORD +93 -75
  90. /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
  91. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/WHEEL +0 -0
  92. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/entry_points.txt +0 -0
  93. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/background/tasks/process_prometheus_metrics.py

@@ -9,11 +9,17 @@ from sqlalchemy.orm import joinedload
 from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
 from dstack._internal.core.models.runs import JobStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import InstanceModel, JobModel, JobPrometheusMetrics
+from dstack._internal.server.models import (
+    InstanceModel,
+    JobModel,
+    JobPrometheusMetrics,
+    ProjectModel,
+)
 from dstack._internal.server.services.instances import get_instance_ssh_private_keys
 from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
 from dstack._internal.server.services.runner import client
 from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.server.utils.common import gather_map_async
 from dstack._internal.utils.common import batched, get_current_datetime, get_or_error, run_async
 from dstack._internal.utils.logging import get_logger
@@ -29,6 +35,7 @@ MIN_COLLECT_INTERVAL_SECONDS = 9
 METRICS_TTL_SECONDS = 600


+@sentry_utils.instrument_background_task
 async def collect_prometheus_metrics():
     now = get_current_datetime()
     cutoff = now - timedelta(seconds=MIN_COLLECT_INTERVAL_SECONDS)
@@ -43,7 +50,11 @@ async def collect_prometheus_metrics():
                 JobPrometheusMetrics.collected_at < cutoff,
             ),
         )
-        .options(joinedload(JobModel.instance).joinedload(InstanceModel.project))
+        .options(
+            joinedload(JobModel.instance)
+            .joinedload(InstanceModel.project)
+            .load_only(ProjectModel.ssh_private_key)
+        )
         .order_by(JobModel.last_processed_at.asc())
         .limit(MAX_JOBS_FETCHED)
     )
@@ -52,6 +63,7 @@ async def collect_prometheus_metrics():
         await _collect_jobs_metrics(batch, now)


+@sentry_utils.instrument_background_task
 async def delete_prometheus_metrics():
     now = get_current_datetime()
     cutoff = now - timedelta(seconds=METRICS_TTL_SECONDS)
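Note: the body of `instrument_background_task` is not shown in this diff (`dstack/_internal/server/utils/sentry_utils.py` gains 12 lines, file 82 in the list above). A minimal sketch of what such a decorator typically looks like, assuming the stock `sentry_sdk` transaction API; the names and details below are illustrative, not dstack's actual implementation:

```python
import functools

import sentry_sdk


def instrument_background_task(func):
    """Report each task invocation as a Sentry transaction (hypothetical sketch)."""

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        # One transaction per invocation, named after the wrapped coroutine,
        # so periodic tasks show up individually in Sentry's performance view.
        with sentry_sdk.start_transaction(op="background_task", name=func.__qualname__):
            return await func(*args, **kwargs)

    return wrapper
```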
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -2,12 +2,12 @@ import asyncio
 import re
 import uuid
 from collections.abc import Iterable
-from datetime import timedelta, timezone
+from datetime import timedelta
 from typing import Dict, List, Optional

 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload
+from sqlalchemy.orm import joinedload, load_only

 from dstack._internal import settings
 from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HTTP_PORT
@@ -42,6 +42,7 @@ from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
     JobModel,
+    ProbeModel,
     ProjectModel,
     RepoModel,
     RunModel,
@@ -73,6 +74,7 @@ from dstack._internal.server.services.runs import (
 )
 from dstack._internal.server.services.secrets import get_project_secrets_mapping
 from dstack._internal.server.services.storage import get_default_storage
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.utils import common as common_utils
 from dstack._internal.utils.interpolator import InterpolatorError, VariablesInterpolator
 from dstack._internal.utils.logging import get_logger
@@ -94,6 +96,7 @@ async def process_running_jobs(batch_size: int = 1):
     await asyncio.gather(*tasks)


+@sentry_utils.instrument_background_task
 async def _process_next_running_job():
     lock, lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
     async with get_session_ctx() as session:
@@ -108,9 +111,9 @@ async def _process_next_running_job():
                 RunModel.status.not_in([RunStatus.TERMINATING]),
                 JobModel.id.not_in(lockset),
                 JobModel.last_processed_at
-                < common_utils.get_current_datetime().replace(tzinfo=None)
-                - MIN_PROCESSING_INTERVAL,
+                < common_utils.get_current_datetime() - MIN_PROCESSING_INTERVAL,
             )
+            .options(load_only(JobModel.id))
            .order_by(JobModel.last_processed_at.asc())
            .limit(1)
            .with_for_update(
@@ -133,7 +136,6 @@

 async def _process_running_job(session: AsyncSession, job_model: JobModel):
     # Refetch to load related attributes.
-    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
         select(JobModel)
         .where(JobModel.id == job_model.id)
@@ -144,7 +146,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
     res = await session.execute(
         select(RunModel)
         .where(RunModel.id == job_model.run_id)
-        .options(joinedload(RunModel.project).joinedload(ProjectModel.backends))
+        .options(joinedload(RunModel.project))
         .options(joinedload(RunModel.user))
         .options(joinedload(RunModel.repo))
         .options(joinedload(RunModel.jobs))
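Note: the recurring theme in these hunks is `load_only()`: queries that exist only to pick and lock a row stop fetching wide columns (specs, provisioning data) they never read. A self-contained sketch of the pattern with a stand-in model (not dstack's actual `JobModel`):

```python
from datetime import datetime

from sqlalchemy import select
from sqlalchemy.orm import DeclarativeBase, Mapped, load_only, mapped_column


class Base(DeclarativeBase):
    pass


class Job(Base):  # stand-in model, illustration only
    __tablename__ = "jobs"
    id: Mapped[int] = mapped_column(primary_key=True)
    job_spec_data: Mapped[str]  # wide column we don't want to fetch
    last_processed_at: Mapped[datetime]


# load_only() narrows the SELECT list to the primary key, so the
# lock-acquisition query stays cheap; the full row is refetched afterwards.
stmt = (
    select(Job)
    .options(load_only(Job.id))
    .order_by(Job.last_processed_at.asc())
    .limit(1)
    .with_for_update(skip_locked=True, key_share=True)
)
print(stmt)  # selects only jobs.id; on PostgreSQL the lock renders as FOR NO KEY UPDATE SKIP LOCKED
```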
@@ -160,143 +162,147 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         job_model.status = JobStatus.TERMINATING
         job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
         job_model.last_processed_at = common_utils.get_current_datetime()
+        await session.commit()
         return

     job = find_job(run.jobs, job_model.replica_num, job_model.job_num)

-    # Wait until all other jobs in the replica are provisioned
-    for other_job in run.jobs:
-        if (
-            other_job.job_spec.replica_num == job.job_spec.replica_num
-            and other_job.job_submissions[-1].status == JobStatus.SUBMITTED
-        ):
+    initial_status = job_model.status
+    if initial_status in [JobStatus.PROVISIONING, JobStatus.PULLING]:
+        # Wait until all other jobs in the replica are provisioned
+        for other_job in run.jobs:
+            if (
+                other_job.job_spec.replica_num == job.job_spec.replica_num
+                and other_job.job_submissions[-1].status == JobStatus.SUBMITTED
+            ):
+                job_model.last_processed_at = common_utils.get_current_datetime()
+                await session.commit()
+                return
+
+    cluster_info = _get_cluster_info(
+        jobs=run.jobs,
+        replica_num=job.job_spec.replica_num,
+        job_provisioning_data=job_provisioning_data,
+        job_runtime_data=job_submission.job_runtime_data,
+    )
+
+    volumes = await get_job_attached_volumes(
+        session=session,
+        project=project,
+        run_spec=run.run_spec,
+        job_num=job.job_spec.job_num,
+        job_provisioning_data=job_provisioning_data,
+    )
+
+    repo_creds_model = await get_repo_creds(
+        session=session, repo=repo_model, user=run_model.user
+    )
+    repo_creds = repo_model_to_repo_head_with_creds(repo_model, repo_creds_model).repo_creds
+
+    secrets = await get_project_secrets_mapping(session=session, project=project)
+    try:
+        _interpolate_secrets(secrets, job.job_spec)
+    except InterpolatorError as e:
+        logger.info("%s: terminating due to secrets interpolation error", fmt(job_model))
+        job_model.status = JobStatus.TERMINATING
+        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+        job_model.termination_reason_message = e.args[0]
         job_model.last_processed_at = common_utils.get_current_datetime()
         await session.commit()
         return

-    cluster_info = _get_cluster_info(
-        jobs=run.jobs,
-        replica_num=job.job_spec.replica_num,
-        job_provisioning_data=job_provisioning_data,
-        job_runtime_data=job_submission.job_runtime_data,
-    )
-
-    volumes = await get_job_attached_volumes(
-        session=session,
-        project=project,
-        run_spec=run.run_spec,
-        job_num=job.job_spec.job_num,
-        job_provisioning_data=job_provisioning_data,
-    )
-
     server_ssh_private_keys = get_instance_ssh_private_keys(
         common_utils.get_or_error(job_model.instance)
     )

-    secrets = await get_project_secrets_mapping(session=session, project=project)
-
-    try:
-        _interpolate_secrets(secrets, job.job_spec)
-    except InterpolatorError as e:
-        logger.info("%s: terminating due to secrets interpolation error", fmt(job_model))
-        job_model.status = JobStatus.TERMINATING
-        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
-        job_model.termination_reason_message = e.args[0]
-        job_model.last_processed_at = common_utils.get_current_datetime()
-        return
-
-    repo_creds_model = await get_repo_creds(session=session, repo=repo_model, user=run_model.user)
-    repo_creds = repo_model_to_repo_head_with_creds(repo_model, repo_creds_model).repo_creds
-
-    initial_status = job_model.status
     if initial_status == JobStatus.PROVISIONING:
         if job_provisioning_data.hostname is None:
             await _wait_for_instance_provisioning_data(job_model=job_model)
+            job_model.last_processed_at = common_utils.get_current_datetime()
+            await session.commit()
+            return
+        if _should_wait_for_other_nodes(run, job, job_model):
+            job_model.last_processed_at = common_utils.get_current_datetime()
+            await session.commit()
+            return
+
+        # fails are acceptable until timeout is exceeded
+        if job_provisioning_data.dockerized:
+            logger.debug(
+                "%s: process provisioning job with shim, age=%s",
+                fmt(job_model),
+                job_submission.age,
+            )
+            ssh_user = job_provisioning_data.username
+            user_ssh_key = run.run_spec.ssh_key_pub.strip()
+            public_keys = [project.ssh_public_key.strip(), user_ssh_key]
+            if job_provisioning_data.backend == BackendType.LOCAL:
+                # No need to update ~/.ssh/authorized_keys when running shim locally
+                user_ssh_key = ""
+            success = await common_utils.run_async(
+                _process_provisioning_with_shim,
+                server_ssh_private_keys,
+                job_provisioning_data,
+                None,
+                run,
+                job_model,
+                job_provisioning_data,
+                volumes,
+                job.job_spec.registry_auth,
+                public_keys,
+                ssh_user,
+                user_ssh_key,
+            )
         else:
-            if _should_wait_for_other_nodes(run, job, job_model):
-                job_model.last_processed_at = common_utils.get_current_datetime()
-                await session.commit()
-                return
+            logger.debug(
+                "%s: process provisioning job without shim, age=%s",
+                fmt(job_model),
+                job_submission.age,
+            )
+            # FIXME: downloading file archives and code here is a waste of time if
+            # the runner is not ready yet
+            file_archives = await _get_job_file_archives(
+                session=session,
+                archive_mappings=job.job_spec.file_archives,
+                user=run_model.user,
+            )
+            code = await _get_job_code(
+                session=session,
+                project=project,
+                repo=repo_model,
+                code_hash=_get_repo_code_hash(run, job),
+            )

-            # fails are acceptable until timeout is exceeded
-            if job_provisioning_data.dockerized:
-                logger.debug(
-                    "%s: process provisioning job with shim, age=%s",
-                    fmt(job_model),
-                    job_submission.age,
-                )
-                ssh_user = job_provisioning_data.username
-                user_ssh_key = run.run_spec.ssh_key_pub.strip()
-                public_keys = [project.ssh_public_key.strip(), user_ssh_key]
-                if job_provisioning_data.backend == BackendType.LOCAL:
-                    # No need to update ~/.ssh/authorized_keys when running shim locally
-                    user_ssh_key = ""
-                success = await common_utils.run_async(
-                    _process_provisioning_with_shim,
-                    server_ssh_private_keys,
-                    job_provisioning_data,
-                    None,
-                    run,
-                    job_model,
-                    job_provisioning_data,
-                    volumes,
-                    job.job_spec.registry_auth,
-                    public_keys,
-                    ssh_user,
-                    user_ssh_key,
-                )
-            else:
-                logger.debug(
-                    "%s: process provisioning job without shim, age=%s",
+            success = await common_utils.run_async(
+                _submit_job_to_runner,
+                server_ssh_private_keys,
+                job_provisioning_data,
+                None,
+                run,
+                job_model,
+                job,
+                cluster_info,
+                code,
+                file_archives,
+                secrets,
+                repo_creds,
+                success_if_not_available=False,
+            )
+
+        if not success:
+            # check timeout
+            if job_submission.age > get_provisioning_timeout(
+                backend_type=job_provisioning_data.get_base_backend(),
+                instance_type_name=job_provisioning_data.instance_type.name,
+            ):
+                logger.warning(
+                    "%s: failed because runner has not become available in time, age=%s",
                     fmt(job_model),
                     job_submission.age,
                 )
-                # FIXME: downloading file archives and code here is a waste of time if
-                # the runner is not ready yet
-                file_archives = await _get_job_file_archives(
-                    session=session,
-                    archive_mappings=job.job_spec.file_archives,
-                    user=run_model.user,
-                )
-                code = await _get_job_code(
-                    session=session,
-                    project=project,
-                    repo=repo_model,
-                    code_hash=_get_repo_code_hash(run, job),
-                )
-
-                success = await common_utils.run_async(
-                    _submit_job_to_runner,
-                    server_ssh_private_keys,
-                    job_provisioning_data,
-                    None,
-                    run,
-                    job_model,
-                    job,
-                    cluster_info,
-                    code,
-                    file_archives,
-                    secrets,
-                    repo_creds,
-                    success_if_not_available=False,
-                )
-
-            if not success:
-                # check timeout
-                if job_submission.age > get_provisioning_timeout(
-                    backend_type=job_provisioning_data.get_base_backend(),
-                    instance_type_name=job_provisioning_data.instance_type.name,
-                ):
-                    logger.warning(
-                        "%s: failed because runner has not become available in time, age=%s",
-                        fmt(job_model),
-                        job_submission.age,
-                    )
-                    job_model.status = JobStatus.TERMINATING
-                    job_model.termination_reason = (
-                        JobTerminationReason.WAITING_RUNNER_LIMIT_EXCEEDED
-                    )
-                    # instance will be emptied by process_terminating_jobs
+                job_model.status = JobStatus.TERMINATING
+                job_model.termination_reason = JobTerminationReason.WAITING_RUNNER_LIMIT_EXCEEDED
+                # instance will be emptied by process_terminating_jobs

     else:  # fails are not acceptable
         if initial_status == JobStatus.PULLING:
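Note: both provisioning paths above hand blocking SSH work (`_process_provisioning_with_shim`, `_submit_job_to_runner`) to `common_utils.run_async`. Its body is not part of this diff; a plausible sketch, assuming it simply pushes a synchronous callable onto the default thread-pool executor so the event loop keeps serving other tasks:

```python
import asyncio
import functools


async def run_async(func, *args, **kwargs):
    """Run a blocking function in a worker thread (sketch of the assumed helper)."""
    loop = asyncio.get_running_loop()
    # run_in_executor(None, ...) uses the loop's default ThreadPoolExecutor.
    return await loop.run_in_executor(None, functools.partial(func, *args, **kwargs))
```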
@@ -409,6 +415,18 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             )
             job_model.status = JobStatus.TERMINATING
             job_model.termination_reason = JobTerminationReason.GATEWAY_ERROR
+        else:
+            for probe_num in range(len(job.job_spec.probes)):
+                session.add(
+                    ProbeModel(
+                        name=f"{job_model.job_name}-{probe_num}",
+                        job=job_model,
+                        probe_num=probe_num,
+                        due=common_utils.get_current_datetime(),
+                        success_streak=0,
+                        active=True,
+                    )
+                )

     if job_model.status == JobStatus.RUNNING:
         await _check_gpu_utilization(session, job_model, job)
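Note: this is the seeding half of the new probes feature: one `ProbeModel` row per probe declared in the job spec, due immediately, with an empty success streak. The polling half lives in the new `process_probes.py` task (file 39 above), which is not shown here. A hypothetical sketch of the bookkeeping it implies, with a dataclass standing in for the ORM model:

```python
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone


@dataclass
class Probe:  # field names mirror the ProbeModel columns seeded above
    name: str
    probe_num: int
    due: datetime
    success_streak: int
    active: bool


# Assumed interval; the real schedule lives in process_probes.py.
PROBE_INTERVAL = timedelta(seconds=10)


def record_probe_result(probe: Probe, ok: bool) -> None:
    """Advance the streak on success, reset it on failure, and reschedule."""
    probe.success_streak = probe.success_streak + 1 if ok else 0
    probe.due = datetime.now(timezone.utc) + PROBE_INTERVAL
```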
@@ -801,7 +819,7 @@ def _should_terminate_job_due_to_disconnect(job_model: JobModel) -> bool:
         return False
     return (
         common_utils.get_current_datetime()
-        > job_model.disconnected_at.replace(tzinfo=timezone.utc) + JOB_DISCONNECTED_RETRY_TIMEOUT
+        > job_model.disconnected_at + JOB_DISCONNECTED_RETRY_TIMEOUT
     )
dstack/_internal/server/background/tasks/process_runs.py

@@ -2,9 +2,9 @@ import asyncio
 import datetime
 from typing import List, Optional, Set, Tuple

-from sqlalchemy import select
+from sqlalchemy import and_, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload, selectinload
+from sqlalchemy.orm import joinedload, load_only, selectinload

 import dstack._internal.server.services.services.autoscalers as autoscalers
 from dstack._internal.core.errors import ServerError
@@ -20,7 +20,14 @@ from dstack._internal.core.models.runs import (
     RunTerminationReason,
 )
 from dstack._internal.server.db import get_db, get_session_ctx
-from dstack._internal.server.models import JobModel, ProjectModel, RunModel
+from dstack._internal.server.models import (
+    InstanceModel,
+    JobModel,
+    ProbeModel,
+    ProjectModel,
+    RunModel,
+    UserModel,
+)
 from dstack._internal.server.services.jobs import (
     find_job,
     get_job_specs_from_run_spec,
@@ -30,6 +37,7 @@ from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.prometheus.client_metrics import run_metrics
 from dstack._internal.server.services.runs import (
     fmt,
+    is_replica_ready,
     process_terminating_run,
     retry_run_replica_jobs,
     run_model_to_run,
@@ -37,6 +45,7 @@ from dstack._internal.server.services.runs import (
 )
 from dstack._internal.server.services.secrets import get_project_secrets_mapping
 from dstack._internal.server.services.services import update_service_desired_replica_count
+from dstack._internal.server.utils import sentry_utils
 from dstack._internal.utils import common
 from dstack._internal.utils.logging import get_logger

@@ -53,22 +62,54 @@ async def process_runs(batch_size: int = 1):
     await asyncio.gather(*tasks)


+@sentry_utils.instrument_background_task
 async def _process_next_run():
     run_lock, run_lockset = get_locker(get_db().dialect_name).get_lockset(RunModel.__tablename__)
     job_lock, job_lockset = get_locker(get_db().dialect_name).get_lockset(JobModel.__tablename__)
+    now = common.get_current_datetime()
     async with get_session_ctx() as session:
         async with run_lock, job_lock:
             res = await session.execute(
                 select(RunModel)
                 .where(
-                    RunModel.status.not_in(RunStatus.finished_statuses()),
                     RunModel.id.not_in(run_lockset),
-                    RunModel.last_processed_at
-                    < common.get_current_datetime().replace(tzinfo=None) - MIN_PROCESSING_INTERVAL,
+                    RunModel.last_processed_at < now - MIN_PROCESSING_INTERVAL,
+                    # Filter out runs that don't need to be processed.
+                    # This is only to reduce unnecessary commits.
+                    # Otherwise, we could fetch all active runs and filter them when processing.
+                    or_(
+                        # Active non-pending runs:
+                        RunModel.status.not_in(
+                            RunStatus.finished_statuses() + [RunStatus.PENDING]
+                        ),
+                        # Retrying runs:
+                        and_(
+                            RunModel.status == RunStatus.PENDING,
+                            RunModel.resubmission_attempt > 0,
+                        ),
+                        # Scheduled ready runs:
+                        and_(
+                            RunModel.status == RunStatus.PENDING,
+                            RunModel.resubmission_attempt == 0,
+                            RunModel.next_triggered_at.is_not(None),
+                            RunModel.next_triggered_at < now,
+                        ),
+                        # Scaled-to-zero runs:
+                        # Such runs cannot be scheduled, so we check next_triggered_at.
+                        # If we allowed scheduled services with downscaling to zero,
+                        # this check wouldn't pass.
+                        and_(
+                            RunModel.status == RunStatus.PENDING,
+                            RunModel.resubmission_attempt == 0,
+                            RunModel.next_triggered_at.is_(None),
+                        ),
+                    ),
                 )
+                .options(joinedload(RunModel.jobs).load_only(JobModel.id))
+                .options(load_only(RunModel.id))
                 .order_by(RunModel.last_processed_at.asc())
                 .limit(1)
-                .with_for_update(skip_locked=True, key_share=True)
+                .with_for_update(skip_locked=True, key_share=True, of=RunModel)
             )
             run_model = res.scalar()
             if run_model is None:
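Note: `of=RunModel` is what makes the `joinedload(RunModel.jobs)` above legal under `FOR UPDATE`: PostgreSQL refuses to lock the nullable side of an outer join, so the lock has to be narrowed to the runs table (this also explains the removed "joinedload produces LEFT OUTER JOIN" comments elsewhere in the diff). A self-contained sketch with stand-in models:

```python
from sqlalchemy import ForeignKey, select
from sqlalchemy.orm import (
    DeclarativeBase,
    Mapped,
    joinedload,
    mapped_column,
    relationship,
)


class Base(DeclarativeBase):
    pass


class Run(Base):  # stand-in for RunModel
    __tablename__ = "runs"
    id: Mapped[int] = mapped_column(primary_key=True)
    jobs: Mapped[list["Job"]] = relationship(back_populates="run")


class Job(Base):  # stand-in for JobModel
    __tablename__ = "jobs"
    id: Mapped[int] = mapped_column(primary_key=True)
    run_id: Mapped[int] = mapped_column(ForeignKey("runs.id"))
    run: Mapped["Run"] = relationship(back_populates="jobs")


# joinedload() emits a LEFT OUTER JOIN; without of=Run, PostgreSQL raises
# "FOR UPDATE cannot be applied to the nullable side of an outer join".
stmt = (
    select(Run)
    .options(joinedload(Run.jobs))
    .with_for_update(skip_locked=True, key_share=True, of=Run)
)
```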
@@ -98,20 +139,27 @@ async def _process_next_run():


 async def _process_run(session: AsyncSession, run_model: RunModel):
-    logger.debug("%s: processing run", fmt(run_model))
     # Refetch to load related attributes.
-    # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
         select(RunModel)
         .where(RunModel.id == run_model.id)
         .execution_options(populate_existing=True)
-        .options(joinedload(RunModel.project).joinedload(ProjectModel.backends))
-        .options(joinedload(RunModel.user))
-        .options(joinedload(RunModel.repo))
-        .options(selectinload(RunModel.jobs).joinedload(JobModel.instance))
+        .options(joinedload(RunModel.project).load_only(ProjectModel.id, ProjectModel.name))
+        .options(joinedload(RunModel.user).load_only(UserModel.name))
+        .options(
+            selectinload(RunModel.jobs)
+            .joinedload(JobModel.instance)
+            .load_only(InstanceModel.fleet_id)
+        )
+        .options(
+            selectinload(RunModel.jobs)
+            .joinedload(JobModel.probes)
+            .load_only(ProbeModel.success_streak)
+        )
         .execution_options(populate_existing=True)
     )
     run_model = res.unique().scalar_one()
+    logger.debug("%s: processing run", fmt(run_model))
     try:
         if run_model.status == RunStatus.PENDING:
             await _process_pending_run(session, run_model)
@@ -135,8 +183,12 @@ async def _process_run(session: AsyncSession, run_model: RunModel):
 async def _process_pending_run(session: AsyncSession, run_model: RunModel):
     """Jobs are not created yet"""
     run = run_model_to_run(run_model)
-    if not _pending_run_ready_for_resubmission(run_model, run):
-        logger.debug("%s: pending run is not yet ready for resubmission", fmt(run_model))
+
+    # TODO: Do not select such runs in the first place to avoid redundant processing
+    if run_model.resubmission_attempt > 0 and not _retrying_run_ready_for_resubmission(
+        run_model, run
+    ):
+        logger.debug("%s: retrying run is not yet ready for resubmission", fmt(run_model))
         return

     run_model.desired_replica_count = 1
@@ -160,7 +212,7 @@ async def _process_pending_run(session: AsyncSession, run_model: RunModel):
     logger.info("%s: run status has changed PENDING -> SUBMITTED", fmt(run_model))


-def _pending_run_ready_for_resubmission(run_model: RunModel, run: Run) -> bool:
+def _retrying_run_ready_for_resubmission(run_model: RunModel, run: Run) -> bool:
     if run.latest_job_submission is None:
         # Should not be possible
         return True
@@ -197,7 +249,7 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
     We handle fails, scaling, and status changes.
     """
     run = run_model_to_run(run_model)
-    run_spec = RunSpec.__response__.parse_raw(run_model.run_spec)
+    run_spec = run.run_spec
     retry_single_job = _can_retry_single_job(run_spec)

     run_statuses: Set[RunStatus] = set()
@@ -337,9 +389,7 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
     )
     if run_model.status == RunStatus.SUBMITTED and new_status == RunStatus.PROVISIONING:
         current_time = common.get_current_datetime()
-        submit_to_provision_duration = (
-            current_time - run_model.submitted_at.replace(tzinfo=datetime.timezone.utc)
-        ).total_seconds()
+        submit_to_provision_duration = (current_time - run_model.submitted_at).total_seconds()
         logger.info(
             "%s: run took %.2f seconds from submission to provisioning.",
             fmt(run_model),
@@ -429,22 +479,22 @@ async def _handle_run_replicas(
     )

     replicas_to_stop_count = 0
-    # stop any out-of-date replicas that are not running
-    replicas_to_stop_count += len(
-        {
-            j.replica_num
-            for j in run_model.jobs
-            if j.status
-            not in [JobStatus.RUNNING, JobStatus.TERMINATING] + JobStatus.finished_statuses()
-            and j.deployment_num < run_model.deployment_num
-        }
+    # stop any out-of-date replicas that are not ready
+    replicas_to_stop_count += sum(
+        any(j.deployment_num < run_model.deployment_num for j in jobs)
+        and any(
+            j.status not in [JobStatus.TERMINATING] + JobStatus.finished_statuses()
+            for j in jobs
+        )
+        and not is_replica_ready(jobs)
+        for _, jobs in group_jobs_by_replica_latest(run_model.jobs)
     )
-    running_replica_count = len(
-        {j.replica_num for j in run_model.jobs if j.status == JobStatus.RUNNING}
+    ready_replica_count = sum(
+        is_replica_ready(jobs) for _, jobs in group_jobs_by_replica_latest(run_model.jobs)
     )
-    if running_replica_count > run_model.desired_replica_count:
-        # stop excessive running out-of-date replicas
-        replicas_to_stop_count += running_replica_count - run_model.desired_replica_count
+    if ready_replica_count > run_model.desired_replica_count:
+        # stop excessive ready out-of-date replicas
+        replicas_to_stop_count += ready_replica_count - run_model.desired_replica_count
     if replicas_to_stop_count:
         await scale_run_replicas(
             session,
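Note: `is_replica_ready` and `group_jobs_by_replica_latest` are defined in `dstack/_internal/server/services/runs.py`, whose body is not shown in this diff. Given that `_process_run` now loads `ProbeModel.success_streak` for every job, a plausible (unverified) reading of the new readiness notion is "every job in the replica is running and its probes have a positive success streak", roughly:

```python
# Hypothetical sketch only; the real predicate lives in services/runs.py
# and may use a different threshold or per-probe readiness settings.
def is_replica_ready(jobs) -> bool:
    return all(
        job.status == JobStatus.RUNNING
        and all(probe.success_streak >= 1 for probe in job.probes)
        for job in jobs
    )
```

Under that reading, rolling deployments stop counting a replica toward `desired_replica_count` until its probes pass, instead of the pure `RUNNING`-status check used before.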