dstack 0.18.42__py3-none-any.whl → 0.18.44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/__init__.py +2 -1
- dstack/_internal/cli/commands/apply.py +4 -2
- dstack/_internal/cli/commands/attach.py +21 -1
- dstack/_internal/cli/commands/completion.py +20 -0
- dstack/_internal/cli/commands/delete.py +3 -1
- dstack/_internal/cli/commands/fleet.py +2 -1
- dstack/_internal/cli/commands/gateway.py +7 -2
- dstack/_internal/cli/commands/logs.py +3 -2
- dstack/_internal/cli/commands/stats.py +2 -1
- dstack/_internal/cli/commands/stop.py +2 -1
- dstack/_internal/cli/commands/volume.py +2 -1
- dstack/_internal/cli/main.py +6 -0
- dstack/_internal/cli/services/completion.py +86 -0
- dstack/_internal/cli/services/configurators/run.py +11 -17
- dstack/_internal/cli/utils/fleet.py +5 -1
- dstack/_internal/cli/utils/run.py +11 -0
- dstack/_internal/core/backends/aws/compute.py +23 -10
- dstack/_internal/core/backends/aws/resources.py +3 -3
- dstack/_internal/core/backends/azure/compute.py +15 -9
- dstack/_internal/core/backends/azure/resources.py +2 -0
- dstack/_internal/core/backends/base/compute.py +102 -2
- dstack/_internal/core/backends/base/offers.py +7 -1
- dstack/_internal/core/backends/cudo/compute.py +8 -4
- dstack/_internal/core/backends/datacrunch/compute.py +10 -4
- dstack/_internal/core/backends/gcp/auth.py +19 -13
- dstack/_internal/core/backends/gcp/compute.py +26 -20
- dstack/_internal/core/backends/gcp/resources.py +3 -10
- dstack/_internal/core/backends/kubernetes/compute.py +4 -3
- dstack/_internal/core/backends/lambdalabs/compute.py +9 -3
- dstack/_internal/core/backends/nebius/compute.py +2 -2
- dstack/_internal/core/backends/oci/compute.py +10 -4
- dstack/_internal/core/backends/runpod/compute.py +32 -7
- dstack/_internal/core/backends/runpod/config.py +8 -0
- dstack/_internal/core/backends/tensordock/compute.py +14 -3
- dstack/_internal/core/backends/vastai/compute.py +12 -2
- dstack/_internal/core/backends/vultr/api_client.py +3 -3
- dstack/_internal/core/backends/vultr/compute.py +9 -3
- dstack/_internal/core/models/backends/aws.py +2 -0
- dstack/_internal/core/models/backends/base.py +1 -0
- dstack/_internal/core/models/backends/runpod.py +2 -0
- dstack/_internal/core/models/configurations.py +2 -2
- dstack/_internal/core/models/profiles.py +46 -1
- dstack/_internal/core/models/runs.py +4 -0
- dstack/_internal/core/services/__init__.py +5 -1
- dstack/_internal/core/services/configs/__init__.py +3 -0
- dstack/_internal/server/app.py +11 -1
- dstack/_internal/server/background/__init__.py +10 -0
- dstack/_internal/server/background/tasks/common.py +22 -0
- dstack/_internal/server/background/tasks/process_instances.py +11 -18
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +135 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +74 -34
- dstack/_internal/server/background/tasks/process_runs.py +1 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +4 -1
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -7
- dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py +40 -0
- dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py +140 -0
- dstack/_internal/server/models.py +11 -0
- dstack/_internal/server/routers/logs.py +3 -0
- dstack/_internal/server/routers/metrics.py +21 -2
- dstack/_internal/server/routers/prometheus.py +36 -0
- dstack/_internal/server/security/permissions.py +1 -1
- dstack/_internal/server/services/backends/configurators/aws.py +31 -1
- dstack/_internal/server/services/backends/configurators/gcp.py +8 -15
- dstack/_internal/server/services/backends/configurators/runpod.py +3 -33
- dstack/_internal/server/services/config.py +24 -4
- dstack/_internal/server/services/fleets.py +1 -0
- dstack/_internal/server/services/gateways/__init__.py +1 -0
- dstack/_internal/server/services/jobs/__init__.py +12 -9
- dstack/_internal/server/services/jobs/configurators/base.py +9 -1
- dstack/_internal/server/services/jobs/configurators/dev.py +1 -3
- dstack/_internal/server/services/jobs/configurators/task.py +1 -3
- dstack/_internal/server/services/logs/__init__.py +78 -0
- dstack/_internal/server/services/{logs.py → logs/aws.py} +12 -207
- dstack/_internal/server/services/logs/base.py +47 -0
- dstack/_internal/server/services/logs/filelog.py +110 -0
- dstack/_internal/server/services/logs/gcp.py +165 -0
- dstack/_internal/server/services/metrics.py +103 -70
- dstack/_internal/server/services/pools.py +16 -17
- dstack/_internal/server/services/prometheus.py +87 -0
- dstack/_internal/server/services/proxy/routers/service_proxy.py +14 -7
- dstack/_internal/server/services/runner/client.py +14 -3
- dstack/_internal/server/services/runs.py +43 -15
- dstack/_internal/server/services/volumes.py +1 -0
- dstack/_internal/server/settings.py +6 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js → main-4eb116b97819badd1e2c.js} +131 -78
- dstack/_internal/server/statics/{main-2ac66bfcbd2e39830b88.js.map → main-4eb116b97819badd1e2c.js.map} +1 -1
- dstack/_internal/server/statics/{main-ad5150a441de98cd8987.css → main-da9f8c06a69c20dac23e.css} +1 -1
- dstack/_internal/server/statics/static/media/entraID.d65d1f3e9486a8e56d24fc07b3230885.svg +9 -0
- dstack/_internal/server/testing/common.py +50 -8
- dstack/api/_public/runs.py +4 -1
- dstack/api/server/_fleets.py +2 -0
- dstack/api/server/_runs.py +4 -0
- dstack/api/utils.py +3 -0
- dstack/version.py +2 -2
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/METADATA +13 -3
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/RECORD +115 -97
- tests/_internal/core/backends/base/__init__.py +0 -0
- tests/_internal/core/backends/base/test_compute.py +56 -0
- tests/_internal/server/background/tasks/test_process_prometheus_metrics.py +189 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +126 -1
- tests/_internal/server/conftest.py +4 -5
- tests/_internal/server/routers/test_backends.py +1 -0
- tests/_internal/server/routers/test_fleets.py +2 -0
- tests/_internal/server/routers/test_logs.py +1 -1
- tests/_internal/server/routers/test_metrics.py +15 -0
- tests/_internal/server/routers/test_prometheus.py +244 -0
- tests/_internal/server/routers/test_runs.py +81 -58
- tests/_internal/server/services/test_logs.py +3 -3
- tests/_internal/server/services/test_metrics.py +163 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/WHEEL +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.42.dist-info → dstack-0.18.44.dist-info}/top_level.txt +0 -0
dstack/_internal/server/background/tasks/process_instances.py

@@ -45,7 +45,6 @@ from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
     InstanceRuntime,
     InstanceStatus,
-    InstanceType,
     RemoteConnectionInfo,
     SSHKey,
 )
@@ -63,6 +62,7 @@ from dstack._internal.core.models.runs import (
     Retry,
 )
 from dstack._internal.core.services.profiles import get_retry
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -695,7 +695,8 @@ async def _check_instance(instance: InstanceModel) -> None:

     if instance.status == InstanceStatus.PROVISIONING and instance.started_at is not None:
         provisioning_deadline = _get_provisioning_deadline(
-            instance,
+            instance=instance,
+            job_provisioning_data=job_provisioning_data,
         )
         if get_current_datetime() > provisioning_deadline:
             instance.status = InstanceStatus.TERMINATING
@@ -737,7 +738,8 @@ async def _wait_for_instance_provisioning_data(
             instance.name,
         )
         provisioning_deadline = _get_provisioning_deadline(
-            instance,
+            instance=instance,
+            job_provisioning_data=job_provisioning_data,
         )
         if get_current_datetime() > provisioning_deadline:
             logger.warning(
@@ -959,24 +961,15 @@ def _get_retry_duration_deadline(instance: InstanceModel, retry: Retry) -> datetime.datetime:


 def _get_provisioning_deadline(
-    instance: InstanceModel,
+    instance: InstanceModel,
+    job_provisioning_data: JobProvisioningData,
 ) -> datetime.datetime:
-    timeout_interval =
+    timeout_interval = get_provisioning_timeout(
+        backend_type=job_provisioning_data.get_base_backend(),
+        instance_type_name=job_provisioning_data.instance_type.name,
+    )
     return instance.started_at.replace(tzinfo=datetime.timezone.utc) + timeout_interval


-def _get_instance_timeout_interval(
-    backend_type: BackendType, instance_type_name: str
-) -> timedelta:
-    # when changing timeouts, also consider process_running_jobs._get_runner_timeout_interval
-    if backend_type == BackendType.RUNPOD:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)
-
-
 def _ssh_keys_to_pkeys(ssh_keys: list[SSHKey]) -> list[PKey]:
     return [pkey_from_str(sk.private) for sk in ssh_keys if sk.private is not None]

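Both process_instances.py (above) and process_running_jobs.py (below) now delegate their per-backend provisioning timeouts to get_provisioning_timeout from the new background/tasks/common.py (+22 lines in the file list, not shown in this view). A minimal sketch of such a consolidated helper, assuming it simply merges the timeout tables removed from the two task modules:

from datetime import timedelta

from dstack._internal.core.models.backends.base import BackendType


def get_provisioning_timeout(backend_type: BackendType, instance_type_name: str) -> timedelta:
    # Sketch only: merges the per-backend timeouts previously hardcoded in
    # _get_instance_timeout_interval and _get_runner_timeout_interval.
    if backend_type in (BackendType.RUNPOD, BackendType.LAMBDA, BackendType.KUBERNETES):
        return timedelta(seconds=1200)
    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
        return timedelta(seconds=1200)
    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
        return timedelta(seconds=3300)
    return timedelta(seconds=600)
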
dstack/_internal/server/background/tasks/process_placement_groups.py

@@ -28,6 +28,7 @@ async def process_placement_groups():
             PlacementGroupModel.deleted == False,
             PlacementGroupModel.id.not_in(lockset),
         )
+        .order_by(PlacementGroupModel.id)  # take locks in order
         .with_for_update(skip_locked=True)
     )
     placement_group_models = res.scalars().all()

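The .order_by(...) added before .with_for_update(...) here (and again in process_submitted_jobs.py below) makes concurrent workers acquire row locks in a consistent order, which avoids lock-ordering deadlocks between server replicas. An illustrative sketch of the pattern with a hypothetical model, not code from the package:

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession


async def lock_rows(session: AsyncSession, model, ids):
    # Ordering by primary key before FOR UPDATE makes lock acquisition deterministic,
    # so two workers selecting overlapping id sets cannot deadlock on each other.
    res = await session.execute(
        select(model)
        .where(model.id.in_(ids))
        .order_by(model.id)  # take locks in order
        .with_for_update(skip_locked=True)
    )
    return res.scalars().all()
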
dstack/_internal/server/background/tasks/process_prometheus_metrics.py
ADDED

@@ -0,0 +1,135 @@
+import uuid
+from datetime import datetime, timedelta
+from typing import Optional
+
+import sqlalchemy.exc
+from sqlalchemy import delete, or_, select, update
+from sqlalchemy.orm import joinedload
+
+from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
+from dstack._internal.core.models.runs import JobStatus
+from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.models import InstanceModel, JobModel, JobPrometheusMetrics
+from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
+from dstack._internal.server.services.pools import get_instance_ssh_private_keys
+from dstack._internal.server.services.runner import client
+from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
+from dstack._internal.server.utils.common import gather_map_async
+from dstack._internal.utils.common import batched, get_current_datetime, get_or_error, run_async
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+MAX_JOBS_FETCHED = 100
+BATCH_SIZE = 10
+MIN_COLLECT_INTERVAL_SECONDS = 9
+# 10 minutes should be more than enough to scrape metrics, and, in any case,
+# 10 minutes old metrics has little to no value
+METRICS_TTL_SECONDS = 600
+
+
+async def collect_prometheus_metrics():
+    now = get_current_datetime()
+    cutoff = now - timedelta(seconds=MIN_COLLECT_INTERVAL_SECONDS)
+    async with get_session_ctx() as session:
+        res = await session.execute(
+            select(JobModel)
+            .join(JobPrometheusMetrics, isouter=True)
+            .where(
+                JobModel.status.in_([JobStatus.RUNNING]),
+                or_(
+                    JobPrometheusMetrics.job_id.is_(None),
+                    JobPrometheusMetrics.collected_at < cutoff,
+                ),
+            )
+            .options(joinedload(JobModel.instance).joinedload(InstanceModel.project))
+            .order_by(JobModel.last_processed_at.asc())
+            .limit(MAX_JOBS_FETCHED)
+        )
+        job_models = res.unique().scalars().all()
+    for batch in batched(job_models, BATCH_SIZE):
+        await _collect_jobs_metrics(batch, now)
+
+
+async def delete_prometheus_metrics():
+    now = get_current_datetime()
+    cutoff = now - timedelta(seconds=METRICS_TTL_SECONDS)
+    async with get_session_ctx() as session:
+        await session.execute(
+            delete(JobPrometheusMetrics).where(JobPrometheusMetrics.collected_at < cutoff)
+        )
+        await session.commit()
+
+
+async def _collect_jobs_metrics(job_models: list[JobModel], collected_at: datetime):
+    results = await gather_map_async(job_models, _collect_job_metrics, return_exceptions=True)
+    async with get_session_ctx() as session:
+        for job_model, result in results:
+            if result is None:
+                continue
+            if isinstance(result, BaseException):
+                logger.error(
+                    "Failed to collect job %s Prometheus metrics: %r", job_model.job_name, result
+                )
+                continue
+            res = await session.execute(
+                update(JobPrometheusMetrics)
+                .where(JobPrometheusMetrics.job_id == job_model.id)
+                .values(
+                    collected_at=collected_at,
+                    text=result,
+                )
+                .returning(JobPrometheusMetrics)
+            )
+            metrics = res.scalar()
+            if metrics is None:
+                metrics = JobPrometheusMetrics(
+                    job_id=job_model.id,
+                    collected_at=collected_at,
+                    text=result,
+                )
+                try:
+                    async with session.begin_nested():
+                        session.add(metrics)
+                except sqlalchemy.exc.IntegrityError:
+                    # Concurrent server replica already committed, ignoring
+                    pass
+        await session.commit()
+
+
+async def _collect_job_metrics(job_model: JobModel) -> Optional[str]:
+    ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance))
+    jpd = get_job_provisioning_data(job_model)
+    jrd = get_job_runtime_data(job_model)
+    if jpd is None:
+        return None
+    try:
+        res = await run_async(
+            _pull_job_metrics,
+            ssh_private_keys,
+            jpd,
+            jrd,
+            job_model.id,
+        )
+    except Exception:
+        logger.exception("Failed to collect job %s Prometheus metrics", job_model.job_name)
+        return None
+
+    if isinstance(res, bool):
+        logger.warning(
+            "Failed to connect to job %s to collect Prometheus metrics", job_model.job_name
+        )
+        return None
+
+    if res is None:
+        # Either not supported by shim or exporter is not available
+        return None
+
+    return res
+
+
+@runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
+def _pull_job_metrics(ports: dict[int, int], task_id: uuid.UUID) -> Optional[str]:
+    shim_client = client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
+    return shim_client.get_task_metrics(task_id)

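The raw Prometheus exposition-format text stored by this task is later served through the new Prometheus endpoint (routers/prometheus.py and services/prometheus.py in the file list above, whose diffs are not included in this view). A minimal sketch, with assumed function names, of reading the stored payloads back for a scrape response:

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from dstack._internal.server.models import JobPrometheusMetrics


async def get_all_jobs_metrics_text(session: AsyncSession) -> str:
    # Sketch only: each row already holds a complete Prometheus exposition-format
    # payload scraped from the job's shim, so serving is a simple concatenation.
    res = await session.execute(select(JobPrometheusMetrics))
    return "\n".join(m.text for m in res.scalars().all())
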
dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -1,4 +1,5 @@
 import asyncio
+from collections.abc import Iterable
 from datetime import timedelta
 from typing import Dict, List, Optional

@@ -16,11 +17,13 @@ from dstack._internal.core.models.instances import (
     RemoteConnectionInfo,
     SSHConnectionParams,
 )
+from dstack._internal.core.models.metrics import Metric
 from dstack._internal.core.models.repos import RemoteRepoCreds
 from dstack._internal.core.models.runs import (
     ClusterInfo,
     Job,
     JobProvisioningData,
+    JobRuntimeData,
     JobSpec,
     JobStatus,
     JobTerminationReason,
@@ -28,6 +31,7 @@ from dstack._internal.core.models.runs import (
     RunSpec,
 )
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
@@ -47,6 +51,7 @@ from dstack._internal.server.services.jobs import (
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
+from dstack._internal.server.services.metrics import get_job_metrics
 from dstack._internal.server.services.pools import get_instance_ssh_private_keys
 from dstack._internal.server.services.repos import (
     get_code_model,
@@ -148,6 +153,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             jobs=run.jobs,
             replica_num=job.job_spec.replica_num,
             job_provisioning_data=job_provisioning_data,
+            job_runtime_data=job_submission.job_runtime_data,
         )

         volumes = await get_job_attached_volumes(
@@ -242,7 +248,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):

         if not success:
             # check timeout
-            if job_submission.age >
+            if job_submission.age > get_provisioning_timeout(
                 backend_type=job_provisioning_data.get_base_backend(),
                 instance_type_name=job_provisioning_data.instance_type.name,
             ):
@@ -341,6 +347,9 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             job_model.status = JobStatus.TERMINATING
             job_model.termination_reason = JobTerminationReason.GATEWAY_ERROR

+    if job_model.status == JobStatus.RUNNING:
+        await _check_gpu_utilization(session, job_model, job)
+
     job_model.last_processed_at = common_utils.get_current_datetime()
     await session.commit()

@@ -644,33 +653,74 @@ def _terminate_if_inactivity_duration_exceeded(
     run_model: RunModel, job_model: JobModel, no_connections_secs: Optional[int]
 ) -> None:
     conf = RunSpec.__response__.parse_raw(run_model.run_spec).configuration
-    if is_core_model_instance(conf, DevEnvironmentConfiguration)
+    if not is_core_model_instance(conf, DevEnvironmentConfiguration) or not isinstance(
         conf.inactivity_duration, int
     ):
-
-        job_model.inactivity_secs =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # reset in case inactivity_duration was disabled via in-place update
+        job_model.inactivity_secs = None
+        return
+    logger.debug("%s: no SSH connections for %s seconds", fmt(job_model), no_connections_secs)
+    job_model.inactivity_secs = no_connections_secs
+    if no_connections_secs is None:
+        # TODO(0.19 or earlier): make no_connections_secs required
+        job_model.status = JobStatus.TERMINATING
+        job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
+        job_model.termination_reason_message = (
+            "The selected instance was created before dstack 0.18.41"
+            " and does not support inactivity_duration"
+        )
+    elif no_connections_secs >= conf.inactivity_duration:
+        job_model.status = JobStatus.TERMINATING
+        # TODO(0.19 or earlier): set JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
+        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+        job_model.termination_reason_message = (
+            f"The job was inactive for {no_connections_secs} seconds,"
+            f" exceeding the inactivity_duration of {conf.inactivity_duration} seconds"
+        )
+
+
+async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
+    policy = job.job_spec.utilization_policy
+    if policy is None:
+        return
+    after = common_utils.get_current_datetime() - timedelta(seconds=policy.time_window)
+    job_metrics = await get_job_metrics(session, job_model, after=after)
+    gpus_util_metrics: list[Metric] = []
+    for metric in job_metrics.metrics:
+        if metric.name.startswith("gpu_util_percent_gpu"):
+            gpus_util_metrics.append(metric)
+    if not gpus_util_metrics or gpus_util_metrics[0].timestamps[-1] > after + timedelta(minutes=1):
+        # Job has started recently, not enough points collected.
+        # Assuming that metrics collection interval less than 1 minute.
+        logger.debug("%s: GPU utilization check: not enough samples", fmt(job_model))
+        return
+    if _should_terminate_due_to_low_gpu_util(
+        policy.min_gpu_utilization, [m.values for m in gpus_util_metrics]
+    ):
+        logger.info("%s: GPU utilization check: terminating", fmt(job_model))
+        job_model.status = JobStatus.TERMINATING
+        # TODO(0.19 or earlier): set JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY
+        job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+        job_model.termination_reason_message = (
+            f"The job GPU utilization below {policy.min_gpu_utilization}%"
+            f" for {policy.time_window} seconds"
+        )
+    else:
+        logger.debug("%s: GPU utilization check: OK", fmt(job_model))
+
+
+def _should_terminate_due_to_low_gpu_util(min_util: int, gpus_util: Iterable[Iterable[int]]):
+    for gpu_util in gpus_util:
+        if all(util < min_util for util in gpu_util):
+            return True
+    return False


 def _get_cluster_info(
     jobs: List[Job],
     replica_num: int,
     job_provisioning_data: JobProvisioningData,
+    job_runtime_data: Optional[JobRuntimeData],
 ) -> ClusterInfo:
     job_ips = []
     for job in jobs:
@@ -681,10 +731,13 @@ def _get_cluster_info(
             ).internal_ip
             or ""
         )
+    gpus_per_job = len(job_provisioning_data.instance_type.resources.gpus)
+    if job_runtime_data is not None and job_runtime_data.offer is not None:
+        gpus_per_job = len(job_runtime_data.offer.instance.resources.gpus)
     cluster_info = ClusterInfo(
         job_ips=job_ips,
         master_job_ip=job_ips[0],
-        gpus_per_job=
+        gpus_per_job=gpus_per_job,
     )
     return cluster_info

@@ -763,16 +816,3 @@ def _submit_job_to_runner(
     # do not log here, because the runner will send a new status

     return True
-
-
-def _get_runner_timeout_interval(backend_type: BackendType, instance_type_name: str) -> timedelta:
-    # when changing timeouts, also consider process_instances._get_instance_timeout_interval
-    if backend_type == BackendType.LAMBDA:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.KUBERNETES:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)

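For clarity, _should_terminate_due_to_low_gpu_util as added above returns True as soon as any single GPU stayed below min_gpu_utilization for every sample in the time window; one busy sample per GPU is enough to keep the job. A small usage example with made-up values, evaluated in the context of process_running_jobs.py:

samples = [
    [5, 3, 7],     # GPU 0 never exceeded 10% over the window
    [80, 95, 60],  # GPU 1 was busy
]
assert _should_terminate_due_to_low_gpu_util(10, samples)            # GPU 0 triggers termination
assert not _should_terminate_due_to_low_gpu_util(10, [[80, 5, 60]])  # one busy sample keeps the job
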
dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -35,6 +35,7 @@ from dstack._internal.core.models.runs import (
 )
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.core.services.profiles import get_termination
+from dstack._internal.server import settings
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -195,6 +196,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
                 InstanceModel.total_blocks > InstanceModel.busy_blocks,
             )
             .options(lazyload(InstanceModel.jobs))
+            .order_by(InstanceModel.id)  # take locks in order
             .with_for_update()
         )
         pool_instances = list(res.unique().scalars().all())
@@ -319,6 +321,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             select(VolumeModel)
             .where(VolumeModel.id.in_(volumes_ids))
             .options(selectinload(VolumeModel.user))
+            .order_by(VolumeModel.id)  # take locks in order
             .with_for_update()
         )
         async with get_locker().lock_ctx(VolumeModel.__tablename__, volumes_ids):
@@ -450,7 +453,7 @@ async def _run_job_on_new_instance(
     )
     # Limit number of offers tried to prevent long-running processing
     # in case all offers fail.
-    for backend, offer in offers[:
+    for backend, offer in offers[: settings.MAX_OFFERS_TRIED]:
         logger.debug(
             "%s: trying %s in %s/%s for $%0.4f per hour",
             fmt(job_model),

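The loop above now caps the number of offers tried via the new settings.MAX_OFFERS_TRIED (settings.py gains 6 lines in the file list, not shown in this view). A hypothetical sketch of how such a server setting is typically wired up; the environment variable name and default below are assumptions, not the actual dstack code:

import os

# Hypothetical: the real definition lives in dstack/_internal/server/settings.py,
# whose diff is not included here.
MAX_OFFERS_TRIED = int(os.getenv("DSTACK_MAX_OFFERS_TRIED", 25))
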
|
@@ -11,7 +11,6 @@ from dstack._internal.server.models import (
|
|
|
11
11
|
JobModel,
|
|
12
12
|
ProjectModel,
|
|
13
13
|
VolumeAttachmentModel,
|
|
14
|
-
VolumeModel,
|
|
15
14
|
)
|
|
16
15
|
from dstack._internal.server.services.jobs import (
|
|
17
16
|
process_terminating_job,
|
|
@@ -86,12 +85,7 @@ async def _process_job(session: AsyncSession, job_model: JobModel):
|
|
|
86
85
|
.where(InstanceModel.id == job_model.used_instance_id)
|
|
87
86
|
.options(
|
|
88
87
|
joinedload(InstanceModel.project).joinedload(ProjectModel.backends),
|
|
89
|
-
joinedload(InstanceModel.volume_attachments)
|
|
90
|
-
.joinedload(VolumeAttachmentModel.volume)
|
|
91
|
-
.joinedload(VolumeModel.user),
|
|
92
|
-
joinedload(InstanceModel.volume_attachments)
|
|
93
|
-
.joinedload(VolumeAttachmentModel.volume)
|
|
94
|
-
.joinedload(VolumeModel.attachments),
|
|
88
|
+
joinedload(InstanceModel.volume_attachments).joinedload(VolumeAttachmentModel.volume),
|
|
95
89
|
)
|
|
96
90
|
)
|
|
97
91
|
instance_model = res.unique().scalar()
|
|
dstack/_internal/server/migrations/versions/60e444118b6d_add_jobprometheusmetrics.py
ADDED

@@ -0,0 +1,40 @@
+"""Add JobPrometheusMetrics
+
+Revision ID: 60e444118b6d
+Revises: a751ef183f27
+Create Date: 2025-02-21 10:59:26.339353
+
+"""
+
+import sqlalchemy as sa
+import sqlalchemy_utils
+from alembic import op
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "60e444118b6d"
+down_revision = "a751ef183f27"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "job_prometheus_metrics",
+        sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
+        sa.Column("collected_at", dstack._internal.server.models.NaiveDateTime(), nullable=False),
+        sa.Column("text", sa.Text(), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["job_id"], ["jobs.id"], name=op.f("fk_job_prometheus_metrics_job_id_jobs")
+        ),
+        sa.PrimaryKeyConstraint("job_id", name=op.f("pk_job_prometheus_metrics")),
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table("job_prometheus_metrics")
+    # ### end Alembic commands ###

dstack/_internal/server/migrations/versions/98d1b92988bc_add_jobterminationreason_terminated_due_.py
ADDED

@@ -0,0 +1,140 @@
+"""Add JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY
+
+Revision ID: 98d1b92988bc
+Revises: 60e444118b6d
+Create Date: 2025-02-28 15:12:37.649876
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+from alembic_postgresql_enum import TableReference
+
+# revision identifiers, used by Alembic.
+revision = "98d1b92988bc"
+down_revision = "60e444118b6d"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # SQLite
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.alter_column(
+            "termination_reason",
+            existing_type=sa.VARCHAR(length=34),
+            type_=sa.Enum(
+                "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+                "INTERRUPTED_BY_NO_CAPACITY",
+                "WAITING_INSTANCE_LIMIT_EXCEEDED",
+                "WAITING_RUNNER_LIMIT_EXCEEDED",
+                "TERMINATED_BY_USER",
+                "VOLUME_ERROR",
+                "GATEWAY_ERROR",
+                "SCALED_DOWN",
+                "DONE_BY_RUNNER",
+                "ABORTED_BY_USER",
+                "TERMINATED_BY_SERVER",
+                "INACTIVITY_DURATION_EXCEEDED",
+                "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+                "CONTAINER_EXITED_WITH_ERROR",
+                "PORTS_BINDING_FAILED",
+                "CREATING_CONTAINER_ERROR",
+                "EXECUTOR_ERROR",
+                "MAX_DURATION_EXCEEDED",
+                name="jobterminationreason",
+            ),
+            existing_nullable=True,
+        )
+    # PostgreSQL
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )
+
+
+def downgrade() -> None:
+    # SQLite
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.alter_column(
+            "termination_reason",
+            existing_type=sa.Enum(
+                "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+                "INTERRUPTED_BY_NO_CAPACITY",
+                "WAITING_INSTANCE_LIMIT_EXCEEDED",
+                "WAITING_RUNNER_LIMIT_EXCEEDED",
+                "TERMINATED_BY_USER",
+                "VOLUME_ERROR",
+                "GATEWAY_ERROR",
+                "SCALED_DOWN",
+                "DONE_BY_RUNNER",
+                "ABORTED_BY_USER",
+                "TERMINATED_BY_SERVER",
+                "INACTIVITY_DURATION_EXCEEDED",
+                "TERMINATED_DUE_TO_UTILIZATION_POLICY",
+                "CONTAINER_EXITED_WITH_ERROR",
+                "PORTS_BINDING_FAILED",
+                "CREATING_CONTAINER_ERROR",
+                "EXECUTOR_ERROR",
+                "MAX_DURATION_EXCEEDED",
+                name="jobterminationreason",
+            ),
+            type_=sa.VARCHAR(length=34),
+            existing_nullable=True,
+        )
+    # PostgreSQL
+    op.sync_enum_values(
+        enum_schema="public",
+        enum_name="jobterminationreason",
+        new_values=[
+            "FAILED_TO_START_DUE_TO_NO_CAPACITY",
+            "INTERRUPTED_BY_NO_CAPACITY",
+            "WAITING_INSTANCE_LIMIT_EXCEEDED",
+            "WAITING_RUNNER_LIMIT_EXCEEDED",
+            "TERMINATED_BY_USER",
+            "VOLUME_ERROR",
+            "GATEWAY_ERROR",
+            "SCALED_DOWN",
+            "DONE_BY_RUNNER",
+            "ABORTED_BY_USER",
+            "TERMINATED_BY_SERVER",
+            "INACTIVITY_DURATION_EXCEEDED",
+            "CONTAINER_EXITED_WITH_ERROR",
+            "PORTS_BINDING_FAILED",
+            "CREATING_CONTAINER_ERROR",
+            "EXECUTOR_ERROR",
+            "MAX_DURATION_EXCEEDED",
+        ],
+        affected_columns=[
+            TableReference(
+                table_schema="public", table_name="jobs", column_name="termination_reason"
+            )
+        ],
+        enum_values_to_rename=[],
+    )

dstack/_internal/server/models.py

@@ -648,3 +648,14 @@ class JobMetricsPoint(BaseModel):
     # json-encoded lists of metric values of len(gpus) length
     gpus_memory_usage_bytes: Mapped[str] = mapped_column(Text)
     gpus_util_percent: Mapped[str] = mapped_column(Text)
+
+
+class JobPrometheusMetrics(BaseModel):
+    __tablename__ = "job_prometheus_metrics"
+
+    job_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("jobs.id"), primary_key=True)
+    job: Mapped["JobModel"] = relationship()
+
+    collected_at: Mapped[datetime] = mapped_column(NaiveDateTime)
+    # Raw Prometheus text response
+    text: Mapped[str] = mapped_column(Text)

dstack/_internal/server/routers/logs.py

@@ -24,4 +24,7 @@ async def poll_logs(
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
 ) -> JobSubmissionLogs:
     _, project = user_project
+    # The runner guarantees logs have different timestamps if throughput < 1k logs / sec.
+    # Otherwise, some logs with duplicated timestamps may be filtered out.
+    # This limitation is imposed by cloud log services that support up to millisecond timestamp resolution.
     return await logs.poll_logs_async(project=project, request=body)