dstack 0.18.43__py3-none-any.whl → 0.18.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/services/configurators/run.py +1 -0
- dstack/_internal/cli/utils/run.py +11 -0
- dstack/_internal/core/backends/aws/compute.py +1 -0
- dstack/_internal/core/backends/azure/compute.py +1 -1
- dstack/_internal/core/backends/gcp/compute.py +1 -1
- dstack/_internal/core/backends/runpod/compute.py +21 -3
- dstack/_internal/core/backends/runpod/config.py +8 -0
- dstack/_internal/core/models/backends/runpod.py +2 -0
- dstack/_internal/core/models/configurations.py +2 -1
- dstack/_internal/core/models/profiles.py +46 -1
- dstack/_internal/core/models/runs.py +4 -0
- dstack/_internal/server/app.py +11 -1
- dstack/_internal/server/background/__init__.py +10 -0
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +66 -19
- dstack/_internal/server/background/tasks/process_runs.py +1 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/models.py +11 -0
- dstack/_internal/server/routers/metrics.py +21 -2
- dstack/_internal/server/routers/prometheus.py +36 -0
- dstack/_internal/server/security/permissions.py +1 -1
- dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
- dstack/_internal/server/services/config.py +13 -3
- dstack/_internal/server/services/fleets.py +1 -0
- dstack/_internal/server/services/gateways/__init__.py +1 -0
- dstack/_internal/server/services/jobs/configurators/base.py +9 -1
- dstack/_internal/server/services/metrics.py +103 -70
- dstack/_internal/server/services/prometheus.py +87 -0
- dstack/_internal/server/services/runner/client.py +14 -3
- dstack/_internal/server/services/runs.py +43 -15
- dstack/_internal/server/services/volumes.py +1 -0
- dstack/_internal/server/settings.py +3 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js → main-4eb116b97819badd1e2c.js} +66 -13
- dstack/_internal/server/statics/{main-fe8fd9db55df8d10e648.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
- dstack/_internal/server/statics/{main-7510e71dfa9749a4e70e.css → main-da9f8c06a69c20dac23e.css} +1 -1
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/testing/common.py +17 -0
- dstack/api/_public/runs.py +3 -0
- dstack/api/server/_fleets.py +2 -0
- dstack/api/server/_runs.py +4 -0
- dstack/api/utils.py +3 -0
- dstack/version.py +1 -1
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/METADATA +10 -1
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/RECORD +59 -50
- tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +125 -0
- tests/_internal/server/routers/test_fleets.py +2 -0
- tests/_internal/server/routers/test_metrics.py +15 -0
- tests/_internal/server/routers/test_prometheus.py +244 -0
- tests/_internal/server/routers/test_runs.py +79 -56
- tests/_internal/server/services/test_metrics.py +163 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.43.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0

dstack/_internal/cli/services/configurators/run.py
CHANGED

@@ -95,6 +95,7 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
             reservation=profile.reservation,
             spot_policy=profile.spot_policy,
             retry_policy=profile.retry_policy,
+            utilization_policy=profile.utilization_policy,
             max_duration=profile.max_duration,
             stop_duration=profile.stop_duration,
             max_price=profile.max_price,

dstack/_internal/cli/utils/run.py
CHANGED

@@ -4,6 +4,8 @@ from rich.markup import escape
 from rich.table import Table

 from dstack._internal.cli.utils.common import NO_OFFERS_WARNING, add_row_from_dict, console
+from dstack._internal.core.models.common import is_core_model_instance
+from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
 from dstack._internal.core.models.instances import InstanceAvailability
 from dstack._internal.core.models.profiles import (
     DEFAULT_RUN_TERMINATION_IDLE_TIME,
@@ -38,6 +40,13 @@ def print_run_plan(run_plan: RunPlan, offers_limit: int = 3):
         if job_plan.job_spec.max_duration
         else "-"
     )
+    inactivity_duration = None
+    if is_core_model_instance(run_plan.run_spec.configuration, DevEnvironmentConfiguration):
+        inactivity_duration = "-"
+        if isinstance(run_plan.run_spec.configuration.inactivity_duration, int):
+            inactivity_duration = format_pretty_duration(
+                run_plan.run_spec.configuration.inactivity_duration
+            )
     if job_plan.job_spec.retry is None:
         retry = "-"
     else:
@@ -72,6 +81,8 @@ def print_run_plan(run_plan: RunPlan, offers_limit: int = 3):
     props.add_row(th("Resources"), pretty_req)
     props.add_row(th("Max price"), max_price)
     props.add_row(th("Max duration"), max_duration)
+    if inactivity_duration is not None:  # None means n/a
+        props.add_row(th("Inactivity duration"), inactivity_duration)
     props.add_row(th("Spot policy"), spot_policy)
     props.add_row(th("Retry policy"), retry)
     props.add_row(th("Creation policy"), creation_policy)

dstack/_internal/core/backends/azure/compute.py
CHANGED

@@ -312,7 +312,7 @@ def get_resource_group_network_subnet_or_error(
         except Exception:
             raise ComputeError(
                 "Network specified in incorrect format."
-                " Supported format for `
+                " Supported format for `vpc_ids` values: 'networkResourceGroupName/networkName'"
             )
     elif resource_group is not None:
         network_name = azure_utils.get_default_network_name(resource_group, location)

dstack/_internal/core/backends/gcp/compute.py
CHANGED

@@ -580,7 +580,7 @@ class GCPCompute(Compute):
             operation = self.disk_client.delete(
                 project=self.config.project_id,
                 zone=get_or_error(volume.provisioning_data).availability_zone,
-                disk=volume.
+                disk=volume.volume_id,
             )
             gcp_resources.wait_for_extended_operation(operation, "persistent disk deletion")
         except google.api_core.exceptions.NotFound:

dstack/_internal/core/backends/runpod/compute.py
CHANGED

@@ -52,8 +52,9 @@ class RunpodCompute(Compute):
     ) -> List[InstanceOfferWithAvailability]:
         offers = get_catalog_offers(
             backend=BackendType.RUNPOD,
-            locations=self.config.regions,
+            locations=self.config.regions or None,
             requirements=requirements,
+            extra_filter=lambda o: _is_secure_cloud(o.region) or self.config.allow_community_cloud,
         )
         offers = [
             InstanceOfferWithAvailability(
@@ -102,13 +103,22 @@ class RunpodCompute(Compute):
         bid_per_gpu = None
         if instance_offer.instance.resources.spot and gpu_count:
             bid_per_gpu = instance_offer.price / gpu_count
+        if _is_secure_cloud(instance_offer.region):
+            cloud_type = "SECURE"
+            data_center_id = instance_offer.region
+            country_code = None
+        else:
+            cloud_type = "COMMUNITY"
+            data_center_id = None
+            country_code = instance_offer.region

         resp = self.api_client.create_pod(
             name=pod_name,
             image_name=job.job_spec.image_name,
             gpu_type_id=instance_offer.instance.name,
-            cloud_type=
-            data_center_id=
+            cloud_type=cloud_type,
+            data_center_id=data_center_id,
+            country_code=country_code,
             gpu_count=gpu_count,
             container_disk_in_gb=disk_size,
             min_vcpu_count=instance_offer.instance.resources.cpus,
@@ -257,3 +267,11 @@ def _get_volume_price(size: int) -> float:
     if size < 1000:
         return 0.07 * size
     return 0.05 * size
+
+
+def _is_secure_cloud(region: str) -> str:
+    """
+    Secure cloud regions are datacenter IDs: CA-MTL-1, EU-NL-1, etc.
+    Community cloud regions are country codes: CA, NL, etc.
+    """
+    return "-" in region
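
The new `_is_secure_cloud` helper and the `extra_filter` passed to `get_catalog_offers` decide whether community-cloud offers are exposed at all. Below is a minimal standalone sketch of that filtering logic; the `Offer` dataclass is a stand-in for dstack's offer model, not the real class.

```python
from dataclasses import dataclass


@dataclass
class Offer:
    region: str
    price: float


def is_secure_cloud(region: str) -> bool:
    # Secure cloud regions are datacenter IDs ("CA-MTL-1", "EU-NL-1"),
    # community cloud regions are bare country codes ("CA", "NL"),
    # so the presence of a dash marks a secure-cloud offer.
    return "-" in region


def filter_offers(offers: list[Offer], allow_community_cloud: bool) -> list[Offer]:
    # Mirrors the extra_filter lambda above: secure-cloud offers always pass,
    # community-cloud offers pass only when community cloud is allowed
    # (the new backend option defaults to allowing it).
    return [o for o in offers if is_secure_cloud(o.region) or allow_community_cloud]


offers = [Offer("CA-MTL-1", 1.2), Offer("CA", 0.9), Offer("EU-NL-1", 1.1)]
assert [o.region for o in filter_offers(offers, allow_community_cloud=False)] == ["CA-MTL-1", "EU-NL-1"]
assert len(filter_offers(offers, allow_community_cloud=True)) == 3
```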

dstack/_internal/core/backends/runpod/config.py
CHANGED

@@ -4,6 +4,14 @@ from dstack._internal.core.models.backends.runpod import (
     RunpodStoredConfig,
 )

+RUNPOD_COMMUNITY_CLOUD_DEFAULT = True
+

 class RunpodConfig(RunpodStoredConfig, BackendConfig):
     creds: AnyRunpodCreds
+
+    @property
+    def allow_community_cloud(self) -> bool:
+        if self.community_cloud is not None:
+            return self.community_cloud
+        return RUNPOD_COMMUNITY_CLOUD_DEFAULT

dstack/_internal/core/models/backends/runpod.py
CHANGED

@@ -10,6 +10,7 @@ from dstack._internal.core.models.common import CoreModel
 class RunpodConfigInfo(CoreModel):
     type: Literal["runpod"] = "runpod"
     regions: Optional[List[str]] = None
+    community_cloud: Optional[bool] = None


 class RunpodStoredConfig(RunpodConfigInfo):
@@ -33,6 +34,7 @@ class RunpodConfigInfoWithCredsPartial(CoreModel):
     type: Literal["runpod"] = "runpod"
     creds: Optional[AnyRunpodCreds]
     regions: Optional[List[str]]
+    community_cloud: Optional[bool]


 class RunpodConfigValues(CoreModel):

dstack/_internal/core/models/configurations.py
CHANGED

@@ -221,7 +221,8 @@ class DevEnvironmentConfigurationParams(CoreModel):
                 " Inactivity is defined as the absence of SSH connections to the"
                 " dev environment, including VS Code connections, `ssh <run name>`"
                 " shells, and attached `dstack apply` or `dstack attach` commands."
-                " Use `off` for unlimited duration.
+                " Use `off` for unlimited duration. Can be updated in-place."
+                " Defaults to `off`"
             )
         ),
     ]

dstack/_internal/core/models/profiles.py
CHANGED

@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import List, Optional, Union
+from typing import List, Optional, Union, overload

 from pydantic import Field, root_validator, validator
 from typing_extensions import Annotated, Literal
@@ -34,6 +34,14 @@ class TerminationPolicy(str, Enum):
     DESTROY_AFTER_IDLE = "destroy-after-idle"


+@overload
+def parse_duration(v: None) -> None: ...
+
+
+@overload
+def parse_duration(v: Union[int, str]) -> int: ...
+
+
 def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]:
     if v is None:
         return None
@@ -112,6 +120,39 @@ class ProfileRetry(CoreModel):
         return values


+class UtilizationPolicy(CoreModel):
+    _min_time_window = "5m"
+
+    min_gpu_utilization: Annotated[
+        int,
+        Field(
+            description=(
+                "Minimum required GPU utilization, percent."
+                " If any GPU has utilization below specified value during the whole time window,"
+                " the run is terminated"
+            ),
+            ge=0,
+            le=100,
+        ),
+    ]
+    time_window: Annotated[
+        Union[int, str],
+        Field(
+            description=(
+                "The time window of metric samples taking into account to measure utilization"
+                f" (e.g., `30m`, `1h`). Minimum is `{_min_time_window}`"
+            )
+        ),
+    ]
+
+    @validator("time_window", pre=True)
+    def validate_time_window(cls, v: Union[int, str]) -> int:
+        v = parse_duration(v)
+        if v < parse_duration(cls._min_time_window):
+            raise ValueError(f"Minimum time_window is {cls._min_time_window}")
+        return v
+
+
 class ProfileParams(CoreModel):
     backends: Annotated[
         Optional[List[BackendType]],
@@ -194,6 +235,10 @@ class ProfileParams(CoreModel):
             )
         ),
     ]
+    utilization_policy: Annotated[
+        Optional[UtilizationPolicy],
+        Field(description="Run termination policy based on utilization"),
+    ]
     # Deprecated:
     termination_policy: Annotated[
         Optional[TerminationPolicy],
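
Together, the `parse_duration` overloads and the `UtilizationPolicy` validator normalize `time_window` to an integer number of seconds and reject windows shorter than five minutes. A usage sketch, assuming the imports resolve as in the diff and that duration strings such as `"30m"` parse to seconds:

```python
from dstack._internal.core.models.profiles import UtilizationPolicy, parse_duration

# The @overload declarations only refine typing: None stays None,
# while int/str inputs always yield an int.
assert parse_duration(None) is None
assert parse_duration("30m") == 30 * 60  # assumes "Nm" means N minutes, returned as seconds

# time_window is normalized to seconds by the pre-validator.
policy = UtilizationPolicy(min_gpu_utilization=10, time_window="1h")
assert policy.time_window == 3600

# Anything below the 5-minute minimum is rejected.
try:
    UtilizationPolicy(min_gpu_utilization=10, time_window="1m")
except Exception as exc:  # pydantic wraps the ValueError raised by the validator
    print(type(exc).__name__)
```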

dstack/_internal/core/models/runs.py
CHANGED

@@ -23,6 +23,7 @@ from dstack._internal.core.models.profiles import (
     ProfileRetryPolicy,
     RetryEvent,
     SpotPolicy,
+    UtilizationPolicy,
 )
 from dstack._internal.core.models.repos import AnyRunRepoData
 from dstack._internal.core.models.resources import Memory, ResourcesSpec
@@ -114,6 +115,7 @@ class JobTerminationReason(str, Enum):
     ABORTED_BY_USER = "aborted_by_user"
     TERMINATED_BY_SERVER = "terminated_by_server"
     INACTIVITY_DURATION_EXCEEDED = "inactivity_duration_exceeded"
+    TERMINATED_DUE_TO_UTILIZATION_POLICY = "terminated_due_to_utilization_policy"
     # Set by the runner
     CONTAINER_EXITED_WITH_ERROR = "container_exited_with_error"
     PORTS_BINDING_FAILED = "ports_binding_failed"
@@ -135,6 +137,7 @@ class JobTerminationReason(str, Enum):
             self.ABORTED_BY_USER: JobStatus.ABORTED,
             self.TERMINATED_BY_SERVER: JobStatus.TERMINATED,
             self.INACTIVITY_DURATION_EXCEEDED: JobStatus.TERMINATED,
+            self.TERMINATED_DUE_TO_UTILIZATION_POLICY: JobStatus.TERMINATED,
             self.CONTAINER_EXITED_WITH_ERROR: JobStatus.FAILED,
             self.PORTS_BINDING_FAILED: JobStatus.FAILED,
             self.CREATING_CONTAINER_ERROR: JobStatus.FAILED,
@@ -190,6 +193,7 @@ class JobSpec(CoreModel):
     single_branch: Optional[bool] = None
     max_duration: Optional[int]
     stop_duration: Optional[int] = None
+    utilization_policy: Optional[UtilizationPolicy] = None
     registry_auth: Optional[RegistryAuth]
     requirements: Requirements
     retry: Optional[Retry]

dstack/_internal/server/app.py
CHANGED

@@ -29,6 +29,7 @@ from dstack._internal.server.routers import (
     metrics,
     pools,
     projects,
+    prometheus,
     repos,
     runs,
     secrets,
@@ -185,6 +186,7 @@ def register_routes(app: FastAPI, ui: bool = True):
     app.include_router(model_proxy.router, prefix="/proxy/models", tags=["model-proxy"])
     app.include_router(pools.root_router)
     app.include_router(pools.router)
+    app.include_router(prometheus.router)

     @app.exception_handler(ForbiddenError)
     async def forbidden_error_handler(request: Request, exc: ForbiddenError):
@@ -252,7 +254,11 @@ def register_routes(app: FastAPI, ui: bool = True):

     @app.exception_handler(404)
     async def custom_http_exception_handler(request, exc):
-        if
+        if (
+            request.url.path.startswith("/api")
+            or _is_proxy_request(request)
+            or _is_prometheus_request(request)
+        ):
             return JSONResponse(
                 {"detail": exc.detail},
                 status_code=status.HTTP_404_NOT_FOUND,
@@ -283,6 +289,10 @@ def _is_proxy_request(request: Request) -> bool:
     ) and referrer.path.startswith("/proxy")


+def _is_prometheus_request(request: Request) -> bool:
+    return request.url.path.startswith("/metrics")
+
+
 def _print_dstack_logo():
     console.print(
         """[purple]╱╱╭╮╱╱╭╮╱╱╱╱╱╱╭╮
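
The 404 handler now treats `/metrics` like `/api` and proxy paths: unknown URLs under these prefixes return a JSON 404 instead of falling through to the web UI. A simplified sketch of that decision; the SPA fallback below is a placeholder, not dstack's actual handler, and the proxy-referrer check is omitted:

```python
from fastapi import FastAPI, Request, status
from fastapi.responses import HTMLResponse, JSONResponse

app = FastAPI()


def _is_prometheus_request(request: Request) -> bool:
    # Same prefix check added in the diff: Prometheus endpoints live under /metrics.
    return request.url.path.startswith("/metrics")


@app.exception_handler(404)
async def custom_http_exception_handler(request: Request, exc):
    # API and Prometheus paths get a machine-readable 404;
    # everything else falls back to the single-page UI.
    if request.url.path.startswith("/api") or _is_prometheus_request(request):
        return JSONResponse({"detail": exc.detail}, status_code=status.HTTP_404_NOT_FOUND)
    return HTMLResponse("<!-- SPA index placeholder -->")
```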

dstack/_internal/server/background/__init__.py
CHANGED

@@ -1,6 +1,7 @@
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 from apscheduler.triggers.interval import IntervalTrigger

+from dstack._internal.server import settings
 from dstack._internal.server.background.tasks.process_fleets import process_fleets
 from dstack._internal.server.background.tasks.process_gateways import (
     process_gateways_connections,
@@ -16,6 +17,10 @@ from dstack._internal.server.background.tasks.process_metrics import (
 from dstack._internal.server.background.tasks.process_placement_groups import (
     process_placement_groups,
 )
+from dstack._internal.server.background.tasks.process_prometheus_metrics import (
+    collect_prometheus_metrics,
+    delete_prometheus_metrics,
+)
 from dstack._internal.server.background.tasks.process_running_jobs import process_running_jobs
 from dstack._internal.server.background.tasks.process_runs import process_runs
 from dstack._internal.server.background.tasks.process_submitted_jobs import process_submitted_jobs
@@ -43,6 +48,11 @@ def start_background_tasks() -> AsyncIOScheduler:
     # * 150 active instances with up to 2 minutes processing latency
     _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
     _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
+    if settings.ENABLE_PROMETHEUS_METRICS:
+        _scheduler.add_job(
+            collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1
+        )
+        _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
     # process_submitted_jobs and process_instances max processing rate is 75 jobs(instances) per minute.
     _scheduler.add_job(
         process_submitted_jobs,
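
The Prometheus tasks reuse the existing scheduling pattern: collection runs every 10 seconds (the task itself skips jobs scraped less than 9 seconds ago), and a cleanup job prunes rows past the 10-minute TTL every 5 minutes. A minimal sketch of that conditional registration, with the settings flag hard-coded for illustration:

```python
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger

ENABLE_PROMETHEUS_METRICS = True  # stands in for dstack._internal.server.settings


async def collect_prometheus_metrics() -> None: ...


async def delete_prometheus_metrics() -> None: ...


def start_background_tasks() -> AsyncIOScheduler:
    scheduler = AsyncIOScheduler()
    if ENABLE_PROMETHEUS_METRICS:
        # Collect frequently, prune stale rows on a slower cadence.
        scheduler.add_job(collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1)
        scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
    # scheduler.start() should be called once an asyncio event loop is running
    return scheduler
```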

dstack/_internal/server/background/tasks/process_placement_groups.py
CHANGED

@@ -28,6 +28,7 @@ async def process_placement_groups():
                 PlacementGroupModel.deleted == False,
                 PlacementGroupModel.id.not_in(lockset),
             )
+            .order_by(PlacementGroupModel.id)  # take locks in order
             .with_for_update(skip_locked=True)
         )
         placement_group_models = res.scalars().all()
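
This `.order_by(...)` before `.with_for_update(...)` appears in several places in this release (placement groups, pool instances, volumes). Locking rows in a consistent key order prevents two transactions from acquiring overlapping row locks in opposite orders and deadlocking, while `skip_locked` lets a replica pass over rows another worker already holds. A self-contained sketch of the pattern; `Item` is a stand-in model, not part of dstack:

```python
from sqlalchemy import Column, Integer, select
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Item(Base):
    __tablename__ = "items"
    id = Column(Integer, primary_key=True)


# Without order_by, two workers selecting overlapping rows FOR UPDATE may lock
# them in different orders and deadlock; ordering by primary key makes the
# lock acquisition order deterministic across workers.
stmt = (
    select(Item)
    .order_by(Item.id)
    .with_for_update(skip_locked=True)
)
print(stmt)
```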

dstack/_internal/server/background/tasks/process_prometheus_metrics.py
ADDED

@@ -0,0 +1,135 @@
+import uuid
+from datetime import datetime, timedelta
+from typing import Optional
+
+import sqlalchemy.exc
+from sqlalchemy import delete, or_, select, update
+from sqlalchemy.orm import joinedload
+
+from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
+from dstack._internal.core.models.runs import JobStatus
+from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.models import InstanceModel, JobModel, JobPrometheusMetrics
+from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
+from dstack._internal.server.services.pools import get_instance_ssh_private_keys
+from dstack._internal.server.services.runner import client
+from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
+from dstack._internal.server.utils.common import gather_map_async
+from dstack._internal.utils.common import batched, get_current_datetime, get_or_error, run_async
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+MAX_JOBS_FETCHED = 100
+BATCH_SIZE = 10
+MIN_COLLECT_INTERVAL_SECONDS = 9
+# 10 minutes should be more than enough to scrape metrics, and, in any case,
+# 10 minutes old metrics has little to no value
+METRICS_TTL_SECONDS = 600
+
+
+async def collect_prometheus_metrics():
+    now = get_current_datetime()
+    cutoff = now - timedelta(seconds=MIN_COLLECT_INTERVAL_SECONDS)
+    async with get_session_ctx() as session:
+        res = await session.execute(
+            select(JobModel)
+            .join(JobPrometheusMetrics, isouter=True)
+            .where(
+                JobModel.status.in_([JobStatus.RUNNING]),
+                or_(
+                    JobPrometheusMetrics.job_id.is_(None),
+                    JobPrometheusMetrics.collected_at < cutoff,
+                ),
+            )
+            .options(joinedload(JobModel.instance).joinedload(InstanceModel.project))
+            .order_by(JobModel.last_processed_at.asc())
+            .limit(MAX_JOBS_FETCHED)
+        )
+        job_models = res.unique().scalars().all()
+    for batch in batched(job_models, BATCH_SIZE):
+        await _collect_jobs_metrics(batch, now)
+
+
+async def delete_prometheus_metrics():
+    now = get_current_datetime()
+    cutoff = now - timedelta(seconds=METRICS_TTL_SECONDS)
+    async with get_session_ctx() as session:
+        await session.execute(
+            delete(JobPrometheusMetrics).where(JobPrometheusMetrics.collected_at < cutoff)
+        )
+        await session.commit()
+
+
+async def _collect_jobs_metrics(job_models: list[JobModel], collected_at: datetime):
+    results = await gather_map_async(job_models, _collect_job_metrics, return_exceptions=True)
+    async with get_session_ctx() as session:
+        for job_model, result in results:
+            if result is None:
+                continue
+            if isinstance(result, BaseException):
+                logger.error(
+                    "Failed to collect job %s Prometheus metrics: %r", job_model.job_name, result
+                )
+                continue
+            res = await session.execute(
+                update(JobPrometheusMetrics)
+                .where(JobPrometheusMetrics.job_id == job_model.id)
+                .values(
+                    collected_at=collected_at,
+                    text=result,
+                )
+                .returning(JobPrometheusMetrics)
+            )
+            metrics = res.scalar()
+            if metrics is None:
+                metrics = JobPrometheusMetrics(
+                    job_id=job_model.id,
+                    collected_at=collected_at,
+                    text=result,
+                )
+                try:
+                    async with session.begin_nested():
+                        session.add(metrics)
+                except sqlalchemy.exc.IntegrityError:
+                    # Concurrent server replica already committed, ignoring
+                    pass
+        await session.commit()
+
+
+async def _collect_job_metrics(job_model: JobModel) -> Optional[str]:
+    ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance))
+    jpd = get_job_provisioning_data(job_model)
+    jrd = get_job_runtime_data(job_model)
+    if jpd is None:
+        return None
+    try:
+        res = await run_async(
+            _pull_job_metrics,
+            ssh_private_keys,
+            jpd,
+            jrd,
+            job_model.id,
+        )
+    except Exception:
+        logger.exception("Failed to collect job %s Prometheus metrics", job_model.job_name)
+        return None
+
+    if isinstance(res, bool):
+        logger.warning(
+            "Failed to connect to job %s to collect Prometheus metrics", job_model.job_name
+        )
+        return None
+
+    if res is None:
+        # Either not supported by shim or exporter is not available
+        return None
+
+    return res
+
+
+@runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
+def _pull_job_metrics(ports: dict[int, int], task_id: uuid.UUID) -> Optional[str]:
+    shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
+    return shim_client.get_task_metrics(task_id)
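
Each collected sample is the raw Prometheus exposition text scraped from dstack-shim and stored as-is in `JobPrometheusMetrics.text`. A small sketch of what such text looks like and how it can be parsed with the standard `prometheus_client` library; the metric name and labels here are illustrative, not what the shim actually exports:

```python
from prometheus_client.parser import text_string_to_metric_families

# Illustrative exposition text; real metric names and labels come from the shim's exporter.
sample_text = """\
# HELP example_gpu_utilization_percent GPU utilization per device
# TYPE example_gpu_utilization_percent gauge
example_gpu_utilization_percent{gpu="0"} 87.0
example_gpu_utilization_percent{gpu="1"} 12.0
"""

for family in text_string_to_metric_families(sample_text):
    for sample in family.samples:
        print(sample.name, sample.labels, sample.value)
```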

dstack/_internal/server/background/tasks/process_running_jobs.py
CHANGED

@@ -1,4 +1,6 @@
 import asyncio
+from collections.abc import Iterable
+from datetime import timedelta
 from typing import Dict, List, Optional

 from sqlalchemy import select
@@ -15,6 +17,7 @@ from dstack._internal.core.models.instances import (
     RemoteConnectionInfo,
     SSHConnectionParams,
 )
+from dstack._internal.core.models.metrics import Metric
 from dstack._internal.core.models.repos import RemoteRepoCreds
 from dstack._internal.core.models.runs import (
     ClusterInfo,
@@ -48,6 +51,7 @@ from dstack._internal.server.services.jobs import (
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
+from dstack._internal.server.services.metrics import get_job_metrics
 from dstack._internal.server.services.pools import get_instance_ssh_private_keys
 from dstack._internal.server.services.repos import (
     get_code_model,
@@ -343,6 +347,9 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             job_model.status = JobStatus.TERMINATING
             job_model.termination_reason = JobTerminationReason.GATEWAY_ERROR

+    if job_model.status == JobStatus.RUNNING:
+        await _check_gpu_utilization(session, job_model, job)
+
     job_model.last_processed_at = common_utils.get_current_datetime()
     await session.commit()

@@ -646,27 +653,67 @@ def _terminate_if_inactivity_duration_exceeded(
     run_model: RunModel, job_model: JobModel, no_connections_secs: Optional[int]
 ) -> None:
     conf = RunSpec.__response__.parse_raw(run_model.run_spec).configuration
-    if is_core_model_instance(conf, DevEnvironmentConfiguration)
+    if not is_core_model_instance(conf, DevEnvironmentConfiguration) or not isinstance(
         conf.inactivity_duration, int
     ):
-
-        job_model.inactivity_secs =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # reset in case inactivity_duration was disabled via in-place update
+        job_model.inactivity_secs = None
+        return
+    logger.debug("%s: no SSH connections for %s seconds", fmt(job_model), no_connections_secs)
+    job_model.inactivity_secs = no_connections_secs
+    if no_connections_secs is None:
+        # TODO(0.19 or earlier): make no_connections_secs required
+        job_model.status = JobStatus.TERMINATING
+        job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
+        job_model.termination_reason_message = (
+            "The selected instance was created before dstack 0.18.41"
+            " and does not support inactivity_duration"
+        )
+    elif no_connections_secs >= conf.inactivity_duration:
+        job_model.status = JobStatus.TERMINATING
+        # TODO(0.19 or earlier): set JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
+        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+        job_model.termination_reason_message = (
+            f"The job was inactive for {no_connections_secs} seconds,"
+            f" exceeding the inactivity_duration of {conf.inactivity_duration} seconds"
+        )
+
+
+async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
+    policy = job.job_spec.utilization_policy
+    if policy is None:
+        return
+    after = common_utils.get_current_datetime() - timedelta(seconds=policy.time_window)
+    job_metrics = await get_job_metrics(session, job_model, after=after)
+    gpus_util_metrics: list[Metric] = []
+    for metric in job_metrics.metrics:
+        if metric.name.startswith("gpu_util_percent_gpu"):
+            gpus_util_metrics.append(metric)
+    if not gpus_util_metrics or gpus_util_metrics[0].timestamps[-1] > after + timedelta(minutes=1):
+        # Job has started recently, not enough points collected.
+        # Assuming that metrics collection interval less than 1 minute.
+        logger.debug("%s: GPU utilization check: not enough samples", fmt(job_model))
+        return
+    if _should_terminate_due_to_low_gpu_util(
+        policy.min_gpu_utilization, [m.values for m in gpus_util_metrics]
+    ):
+        logger.info("%s: GPU utilization check: terminating", fmt(job_model))
+        job_model.status = JobStatus.TERMINATING
+        # TODO(0.19 or earlier): set JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY
+        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+        job_model.termination_reason_message = (
+            f"The job GPU utilization below {policy.min_gpu_utilization}%"
+            f" for {policy.time_window} seconds"
+        )
+    else:
+        logger.debug("%s: GPU utilization check: OK", fmt(job_model))
+
+
+def _should_terminate_due_to_low_gpu_util(min_util: int, gpus_util: Iterable[Iterable[int]]):
+    for gpu_util in gpus_util:
+        if all(util < min_util for util in gpu_util):
+            return True
+    return False


 def _get_cluster_info(
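
The predicate in `_should_terminate_due_to_low_gpu_util` fires only when at least one GPU stays below the threshold for every sample in the window; a single sample at or above the threshold keeps the run alive. A standalone sketch of the same check with sample data:

```python
from collections.abc import Iterable


def should_terminate(min_util: int, gpus_util: Iterable[Iterable[int]]) -> bool:
    # Same semantics as the helper above: any single GPU whose every sample
    # in the window is below min_util triggers termination.
    for gpu_util in gpus_util:
        if all(util < min_util for util in gpu_util):
            return True
    return False


# GPU 0 idles for the whole window while GPU 1 is busy -> terminate.
assert should_terminate(10, [[0, 1, 2], [95, 90, 88]]) is True
# Every GPU reaches the threshold at least once -> keep running.
assert should_terminate(10, [[0, 50, 2], [95, 90, 88]]) is False
```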

dstack/_internal/server/background/tasks/process_submitted_jobs.py
CHANGED

@@ -35,6 +35,7 @@ from dstack._internal.core.models.runs import (
 )
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.core.services.profiles import get_termination
+from dstack._internal.server import settings
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -195,6 +196,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             InstanceModel.total_blocks > InstanceModel.busy_blocks,
         )
         .options(lazyload(InstanceModel.jobs))
+        .order_by(InstanceModel.id)  # take locks in order
         .with_for_update()
     )
     pool_instances = list(res.unique().scalars().all())
@@ -319,6 +321,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         select(VolumeModel)
         .where(VolumeModel.id.in_(volumes_ids))
         .options(selectinload(VolumeModel.user))
+        .order_by(VolumeModel.id)  # take locks in order
         .with_for_update()
     )
     async with get_locker().lock_ctx(VolumeModel.__tablename__, volumes_ids):
@@ -450,7 +453,7 @@ async def _run_job_on_new_instance(
     )
     # Limit number of offers tried to prevent long-running processing
     # in case all offers fail.
-    for backend, offer in offers[:
+    for backend, offer in offers[: settings.MAX_OFFERS_TRIED]:
         logger.debug(
             "%s: trying %s in %s/%s for $%0.4f per hour",
             fmt(job_model),