dstack 0.19.21__py3-none-any.whl → 0.19.23rc1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release: this version of dstack might be problematic.

Files changed (71)
  1. dstack/_internal/cli/commands/apply.py +8 -3
  2. dstack/_internal/cli/services/configurators/__init__.py +8 -0
  3. dstack/_internal/cli/services/configurators/fleet.py +1 -1
  4. dstack/_internal/cli/services/configurators/gateway.py +1 -1
  5. dstack/_internal/cli/services/configurators/run.py +11 -1
  6. dstack/_internal/cli/services/configurators/volume.py +1 -1
  7. dstack/_internal/cli/utils/common.py +48 -5
  8. dstack/_internal/cli/utils/fleet.py +5 -5
  9. dstack/_internal/cli/utils/run.py +32 -0
  10. dstack/_internal/core/backends/configurators.py +9 -0
  11. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  12. dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
  13. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  14. dstack/_internal/core/backends/hotaisle/compute.py +225 -0
  15. dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
  16. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  17. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  18. dstack/_internal/core/backends/models.py +8 -0
  19. dstack/_internal/core/backends/nebius/compute.py +8 -2
  20. dstack/_internal/core/backends/nebius/fabrics.py +1 -0
  21. dstack/_internal/core/backends/nebius/resources.py +9 -0
  22. dstack/_internal/core/compatibility/runs.py +8 -0
  23. dstack/_internal/core/models/backends/base.py +2 -0
  24. dstack/_internal/core/models/configurations.py +139 -1
  25. dstack/_internal/core/models/health.py +28 -0
  26. dstack/_internal/core/models/instances.py +2 -0
  27. dstack/_internal/core/models/logs.py +2 -1
  28. dstack/_internal/core/models/runs.py +21 -1
  29. dstack/_internal/core/services/ssh/tunnel.py +7 -0
  30. dstack/_internal/server/app.py +4 -0
  31. dstack/_internal/server/background/__init__.py +4 -0
  32. dstack/_internal/server/background/tasks/process_instances.py +107 -56
  33. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  34. dstack/_internal/server/background/tasks/process_running_jobs.py +13 -0
  35. dstack/_internal/server/background/tasks/process_runs.py +21 -14
  36. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  37. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  38. dstack/_internal/server/models.py +41 -0
  39. dstack/_internal/server/routers/instances.py +33 -5
  40. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  41. dstack/_internal/server/schemas/instances.py +32 -0
  42. dstack/_internal/server/schemas/runner.py +5 -0
  43. dstack/_internal/server/services/instances.py +103 -1
  44. dstack/_internal/server/services/jobs/__init__.py +8 -1
  45. dstack/_internal/server/services/jobs/configurators/base.py +26 -0
  46. dstack/_internal/server/services/logging.py +4 -2
  47. dstack/_internal/server/services/logs/aws.py +13 -1
  48. dstack/_internal/server/services/logs/gcp.py +16 -1
  49. dstack/_internal/server/services/probes.py +6 -0
  50. dstack/_internal/server/services/projects.py +16 -4
  51. dstack/_internal/server/services/runner/client.py +52 -20
  52. dstack/_internal/server/services/runner/ssh.py +4 -4
  53. dstack/_internal/server/services/runs.py +49 -13
  54. dstack/_internal/server/services/ssh.py +66 -0
  55. dstack/_internal/server/settings.py +13 -0
  56. dstack/_internal/server/statics/index.html +1 -1
  57. dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
  58. dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
  59. dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
  60. dstack/_internal/server/testing/common.py +44 -0
  61. dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
  62. dstack/_internal/settings.py +3 -0
  63. dstack/_internal/utils/common.py +15 -0
  64. dstack/api/server/__init__.py +1 -1
  65. dstack/version.py +1 -1
  66. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/METADATA +14 -14
  67. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/RECORD +71 -58
  68. /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
  69. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/WHEEL +0 -0
  70. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/entry_points.txt +0 -0
  71. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/licenses/LICENSE.md +0 -0

--- a/dstack/_internal/server/background/tasks/process_instances.py
+++ b/dstack/_internal/server/background/tasks/process_instances.py
@@ -1,13 +1,14 @@
 import asyncio
 import datetime
+import logging
 from datetime import timedelta
-from typing import Any, Dict, List, Optional, Tuple, cast
+from typing import Any, Dict, List, Optional, cast
 
 import requests
 from paramiko.pkey import PKey
 from paramiko.ssh_exception import PasswordRequiredException
 from pydantic import ValidationError
-from sqlalchemy import select
+from sqlalchemy import delete, func, select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload
 
@@ -27,18 +28,6 @@ from dstack._internal.core.backends.features import (
     BACKENDS_WITH_CREATE_INSTANCE_SUPPORT,
     BACKENDS_WITH_PLACEMENT_GROUPS_SUPPORT,
 )
-from dstack._internal.core.backends.remote.provisioning import (
-    detect_cpu_arch,
-    get_host_info,
-    get_paramiko_connection,
-    get_shim_healthcheck,
-    host_info_to_instance_type,
-    remove_dstack_runner_if_exists,
-    remove_host_info_if_exists,
-    run_pre_start_commands,
-    run_shim_as_systemd_service,
-    upload_envs,
-)
 from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
 
 # FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute
@@ -77,12 +66,14 @@ from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
+    InstanceHealthCheckModel,
     InstanceModel,
     JobModel,
     PlacementGroupModel,
     ProjectModel,
 )
-from dstack._internal.server.schemas.runner import HealthcheckResponse
+from dstack._internal.server.schemas.instances import InstanceCheck
+from dstack._internal.server.schemas.runner import HealthcheckResponse, InstanceHealthResponse
 from dstack._internal.server.services import backends as backends_services
 from dstack._internal.server.services.fleets import (
     fleet_model_to_fleet,
@@ -103,9 +94,20 @@ from dstack._internal.server.services.placement import (
     schedule_fleet_placement_groups_deletion,
 )
 from dstack._internal.server.services.runner import client as runner_client
-from dstack._internal.server.services.runner.client import HealthStatus
 from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
 from dstack._internal.server.utils import sentry_utils
+from dstack._internal.server.utils.provisioning import (
+    detect_cpu_arch,
+    get_host_info,
+    get_paramiko_connection,
+    get_shim_healthcheck,
+    host_info_to_instance_type,
+    remove_dstack_runner_if_exists,
+    remove_host_info_if_exists,
+    run_pre_start_commands,
+    run_shim_as_systemd_service,
+    upload_envs,
+)
 from dstack._internal.utils.common import (
     get_current_datetime,
     get_or_error,
@@ -137,6 +139,17 @@ async def process_instances(batch_size: int = 1):
     await asyncio.gather(*tasks)
 
 
+@sentry_utils.instrument_background_task
+async def delete_instance_health_checks():
+    now = get_current_datetime()
+    cutoff = now - timedelta(seconds=server_settings.SERVER_INSTANCE_HEALTH_TTL_SECONDS)
+    async with get_session_ctx() as session:
+        await session.execute(
+            delete(InstanceHealthCheckModel).where(InstanceHealthCheckModel.collected_at < cutoff)
+        )
+        await session.commit()
+
+
 @sentry_utils.instrument_background_task
 async def _process_next_instance():
     lock, lockset = get_locker(get_db().dialect_name).get_lockset(InstanceModel.__tablename__)
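
The retention window above comes from `SERVER_INSTANCE_HEALTH_TTL_SECONDS`, and the collection throttle used later in `_check_instance` comes from `SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS`; both are among the +13 lines added to `dstack/_internal/server/settings.py` (file 55 above). A hypothetical sketch of how such knobs are usually declared there; the environment variable names and defaults below are assumptions, not the shipped values:

```python
# Sketch only: the actual declarations live in dstack/_internal/server/settings.py.
import os

# How long collected health checks are kept before delete_instance_health_checks
# purges them (assumed default: 7 days).
SERVER_INSTANCE_HEALTH_TTL_SECONDS = int(
    os.getenv("DSTACK_SERVER_INSTANCE_HEALTH_TTL_SECONDS", str(7 * 24 * 3600))
)
# Minimum gap between two health collections for the same instance
# (assumed default: 60 seconds).
SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS = int(
    os.getenv("DSTACK_SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS", "60")
)
```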
@@ -415,10 +428,10 @@ async def _add_remote(instance: InstanceModel) -> None:
 
 def _deploy_instance(
     remote_details: RemoteConnectionInfo,
-    pkeys: List[PKey],
+    pkeys: list[PKey],
     ssh_proxy_pkeys: Optional[list[PKey]],
-    authorized_keys: List[str],
-) -> Tuple[HealthStatus, Dict[str, Any], GoArchType]:
+    authorized_keys: list[str],
+) -> tuple[InstanceCheck, dict[str, Any], GoArchType]:
     with get_paramiko_connection(
         remote_details.ssh_user,
         remote_details.host,
@@ -466,14 +479,14 @@ def _deploy_instance(
         host_info = get_host_info(client, dstack_working_dir)
         logger.debug("Received a host_info %s", host_info)
 
-        raw_health = get_shim_healthcheck(client)
+        healthcheck_out = get_shim_healthcheck(client)
         try:
-            health_response = HealthcheckResponse.__response__.parse_raw(raw_health)
+            healthcheck = HealthcheckResponse.__response__.parse_raw(healthcheck_out)
         except ValueError as e:
-            raise ProvisioningError("Cannot read HealthcheckResponse") from e
-        health = runner_client.health_response_to_health_status(health_response)
+            raise ProvisioningError(f"Cannot parse HealthcheckResponse: {e}") from e
+        instance_check = runner_client.healthcheck_response_to_instance_check(healthcheck)
 
-        return health, host_info, arch
+        return instance_check, host_info, arch
 
 
 async def _create_instance(session: AsyncSession, instance: InstanceModel) -> None:
758
771
 
759
772
  ssh_private_keys = get_instance_ssh_private_keys(instance)
760
773
 
774
+ health_check_cutoff = get_current_datetime() - timedelta(
775
+ seconds=server_settings.SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS
776
+ )
777
+ res = await session.execute(
778
+ select(func.count(1)).where(
779
+ InstanceHealthCheckModel.instance_id == instance.id,
780
+ InstanceHealthCheckModel.collected_at > health_check_cutoff,
781
+ )
782
+ )
783
+ check_instance_health = res.scalar_one() == 0
784
+
761
785
  # May return False if fails to establish ssh connection
762
- health_status_response = await run_async(
763
- _instance_healthcheck,
786
+ instance_check = await run_async(
787
+ _check_instance_inner,
764
788
  ssh_private_keys,
765
789
  job_provisioning_data,
766
790
  None,
791
+ check_instance_health=check_instance_health,
767
792
  )
768
- if isinstance(health_status_response, bool) or health_status_response is None:
769
- health_status = HealthStatus(healthy=False, reason="SSH or tunnel error")
770
- else:
771
- health_status = health_status_response
793
+ if instance_check is False:
794
+ instance_check = InstanceCheck(reachable=False, message="SSH or tunnel error")
772
795
 
773
- logger.debug(
774
- "Check instance %s status. shim health: %s",
796
+ if instance_check.reachable and check_instance_health:
797
+ health_status = instance_check.get_health_status()
798
+ else:
799
+ # Keep previous health status
800
+ health_status = instance.health
801
+
802
+ loglevel = logging.DEBUG
803
+ if not instance_check.reachable and instance.status.is_available():
804
+ loglevel = logging.WARNING
805
+ elif check_instance_health and not health_status.is_healthy():
806
+ loglevel = logging.WARNING
807
+ logger.log(
808
+ loglevel,
809
+ "Instance %s check: reachable=%s health_status=%s message=%r",
775
810
  instance.name,
776
- health_status,
777
- extra={"instance_name": instance.name, "shim_health": health_status},
811
+ instance_check.reachable,
812
+ health_status.name,
813
+ instance_check.message,
814
+ extra={"instance_name": instance.name, "health_status": health_status},
778
815
  )
779
816
 
780
- if health_status.healthy:
817
+ if instance_check.has_health_checks():
818
+ # ensured by has_health_checks()
819
+ assert instance_check.health_response is not None
820
+ health_check_model = InstanceHealthCheckModel(
821
+ instance_id=instance.id,
822
+ collected_at=get_current_datetime(),
823
+ status=health_status,
824
+ response=instance_check.health_response.json(),
825
+ )
826
+ session.add(health_check_model)
827
+
828
+ instance.health = health_status
829
+ instance.unreachable = not instance_check.reachable
830
+
831
+ if instance_check.reachable:
781
832
  instance.termination_deadline = None
782
- instance.health_status = None
783
- instance.unreachable = False
784
833
 
785
834
  if instance.status == InstanceStatus.PROVISIONING:
786
835
  instance.status = InstanceStatus.IDLE if not instance.jobs else InstanceStatus.BUSY
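
The rewritten `_check_instance` separates reachability (can the server reach the shim over SSH?) from health (what the shim reports about the hardware), persisting each collected report as an `InstanceHealthCheckModel` row. The `InstanceCheck` schema it leans on is added in `dstack/_internal/server/schemas/instances.py` (+32). A standalone sketch inferred purely from the usage above; any field, method body, or status value not exercised by this hunk is an assumption:

```python
from enum import Enum
from typing import Any, Dict, Optional

from pydantic import BaseModel


class HealthStatus(str, Enum):
    # The migration below backfills instances with 'HEALTHY', so at least this
    # member must exist; other members are guesses.
    HEALTHY = "HEALTHY"
    UNHEALTHY = "UNHEALTHY"

    def is_healthy(self) -> bool:
        return self == HealthStatus.HEALTHY


class InstanceCheck(BaseModel):
    reachable: bool
    message: Optional[str] = None
    # In the real schema this is an InstanceHealthResponse pydantic model
    # (hence the .json() call above); a plain dict keeps this sketch standalone.
    health_response: Optional[Dict[str, Any]] = None

    def has_health_checks(self) -> bool:
        return self.health_response is not None

    def get_health_status(self) -> HealthStatus:
        # Invented logic: the real method presumably derives the status from
        # the shim's DCGM report (see schemas/health/dcgm.py in this release).
        if self.health_response and self.health_response.get("failures"):
            return HealthStatus.UNHEALTHY
        return HealthStatus.HEALTHY
```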
@@ -798,9 +847,6 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> None:
         if instance.termination_deadline is None:
             instance.termination_deadline = get_current_datetime() + TERMINATION_DEADLINE_OFFSET
 
-        instance.health_status = health_status.reason
-        instance.unreachable = True
-
         if instance.status == InstanceStatus.PROVISIONING and instance.started_at is not None:
             provisioning_deadline = _get_provisioning_deadline(
                 instance=instance,
@@ -816,12 +862,7 @@ async def _check_instance(session: AsyncSession, instance: InstanceModel) -> None:
                     "instance_status": InstanceStatus.TERMINATING.value,
                 },
             )
-        elif instance.status in (InstanceStatus.IDLE, InstanceStatus.BUSY):
-            logger.warning(
-                "Instance %s shim is not available",
-                instance.name,
-                extra={"instance_name": instance.name},
-            )
+        elif instance.status.is_available():
             deadline = instance.termination_deadline
             if get_current_datetime() > deadline:
                 instance.status = InstanceStatus.TERMINATING
@@ -892,20 +933,30 @@ async def _wait_for_instance_provisioning_data(
 
 
 @runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
-def _instance_healthcheck(ports: Dict[int, int]) -> HealthStatus:
+def _check_instance_inner(
+    ports: Dict[int, int], *, check_instance_health: bool = False
+) -> InstanceCheck:
+    instance_health_response: Optional[InstanceHealthResponse] = None
     shim_client = runner_client.ShimClient(port=ports[DSTACK_SHIM_HTTP_PORT])
+    method = shim_client.healthcheck
     try:
-        resp = shim_client.healthcheck(unmask_exeptions=True)
-        if resp is None:
-            return HealthStatus(healthy=False, reason="Unknown reason")
-        return runner_client.health_response_to_health_status(resp)
+        healthcheck_response = method(unmask_exceptions=True)
+        if check_instance_health:
+            method = shim_client.get_instance_health
+            instance_health_response = method()
     except requests.RequestException as e:
-        return HealthStatus(healthy=False, reason=f"Can't request shim: {e}")
+        template = "shim.%s(): request error: %s"
+        args = (method.__func__.__name__, e)
+        logger.debug(template, *args)
+        return InstanceCheck(reachable=False, message=template % args)
     except Exception as e:
-        logger.exception("Unknown exception from shim.healthcheck: %s", e)
-        return HealthStatus(
-            healthy=False, reason=f"Unknown exception ({e.__class__.__name__}): {e}"
-        )
+        template = "shim.%s(): unexpected exception %s: %s"
+        args = (method.__func__.__name__, e.__class__.__name__, e)
+        logger.exception(template, *args)
+        return InstanceCheck(reachable=False, message=template % args)
+    return runner_client.healthcheck_response_to_instance_check(
+        healthcheck_response, instance_health_response
+    )
 
 
 async def _terminate(instance: InstanceModel) -> None:
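
One detail in `_check_instance_inner` worth calling out: `method` is rebound between the two shim calls so that the shared `except` blocks can report whichever call actually failed, and `method.__func__.__name__` unwraps the bound method to get at its name. The mechanism in isolation:

```python
class ShimClient:
    def healthcheck(self):
        ...

    def get_instance_health(self):
        ...


client = ShimClient()
method = client.healthcheck          # a bound method
print(method.__func__.__name__)      # -> healthcheck
method = client.get_instance_health
print(method.__func__.__name__)      # -> get_instance_health
```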

--- /dev/null
+++ b/dstack/_internal/server/background/tasks/process_probes.py
@@ -0,0 +1,164 @@
+from collections.abc import AsyncGenerator
+from contextlib import asynccontextmanager
+from datetime import timedelta
+from functools import partial
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import httpx
+from apscheduler.schedulers.asyncio import AsyncIOScheduler
+from httpx import AsyncClient, AsyncHTTPTransport
+from sqlalchemy import select, update
+from sqlalchemy.orm import joinedload
+
+from dstack._internal.core.errors import SSHError
+from dstack._internal.core.models.runs import JobSpec, JobStatus, ProbeSpec
+from dstack._internal.core.services.ssh.tunnel import (
+    SSH_DEFAULT_OPTIONS,
+    IPSocket,
+    SocketPair,
+    UnixSocket,
+)
+from dstack._internal.server.db import get_db, get_session_ctx
+from dstack._internal.server.models import InstanceModel, JobModel, ProbeModel
+from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.services.logging import fmt
+from dstack._internal.server.services.ssh import container_ssh_tunnel
+from dstack._internal.utils.common import get_current_datetime, get_or_error
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+BATCH_SIZE = 100
+SSH_CONNECT_TIMEOUT = timedelta(seconds=10)
+PROCESSING_OVERHEAD_TIMEOUT = timedelta(minutes=1)
+PROBES_SCHEDULER = AsyncIOScheduler()
+
+
+async def process_probes():
+    probe_lock, probe_lockset = get_locker(get_db().dialect_name).get_lockset(
+        ProbeModel.__tablename__
+    )
+    async with get_session_ctx() as session:
+        async with probe_lock:
+            res = await session.execute(
+                select(ProbeModel.id)
+                .where(ProbeModel.id.not_in(probe_lockset))
+                .where(ProbeModel.active == True)
+                .where(ProbeModel.due <= get_current_datetime())
+                .order_by(ProbeModel.due.asc())
+                .limit(BATCH_SIZE)
+                .with_for_update(skip_locked=True, key_share=True)
+            )
+            probe_ids = res.unique().scalars().all()
+            probe_lockset.update(probe_ids)
+
+        try:
+            # Refetch to load all attributes.
+            # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
+            res = await session.execute(
+                select(ProbeModel)
+                .where(ProbeModel.id.in_(probe_ids))
+                .options(
+                    joinedload(ProbeModel.job)
+                    .joinedload(JobModel.instance)
+                    .joinedload(InstanceModel.project)
+                )
+                .options(joinedload(ProbeModel.job))
+                .execution_options(populate_existing=True)
+            )
+            probes = res.unique().scalars().all()
+            for probe in probes:
+                if probe.job.status != JobStatus.RUNNING:
+                    probe.active = False
+                else:
+                    job_spec: JobSpec = JobSpec.__response__.parse_raw(probe.job.job_spec_data)
+                    probe_spec = job_spec.probes[probe.probe_num]
+                    # Schedule the next probe execution in case this execution is interrupted
+                    probe.due = get_current_datetime() + _get_probe_async_processing_timeout(
+                        probe_spec
+                    )
+                    # Execute the probe asynchronously outside of the DB session
+                    PROBES_SCHEDULER.add_job(partial(_process_probe_async, probe, probe_spec))
+            await session.commit()
+        finally:
+            probe_lockset.difference_update(probe_ids)
+
+
+async def _process_probe_async(probe: ProbeModel, probe_spec: ProbeSpec) -> None:
+    start = get_current_datetime()
+    logger.debug("%s: processing probe", fmt(probe))
+    success = await _execute_probe(probe, probe_spec)
+
+    async with get_session_ctx() as session:
+        async with get_locker(get_db().dialect_name).lock_ctx(
+            ProbeModel.__tablename__, [probe.id]
+        ):
+            await session.execute(
+                update(ProbeModel)
+                .where(ProbeModel.id == probe.id)
+                .values(
+                    success_streak=0 if not success else ProbeModel.success_streak + 1,
+                    due=get_current_datetime() + timedelta(seconds=probe_spec.interval),
+                )
+            )
+    logger.debug(
+        "%s: probe processing took %ss",
+        fmt(probe),
+        (get_current_datetime() - start).total_seconds(),
+    )
+
+
+async def _execute_probe(probe: ProbeModel, probe_spec: ProbeSpec) -> bool:
+    """
+    Returns:
+        Whether probe execution was successful.
+    """
+
+    try:
+        async with _get_service_replica_client(probe.job) as client:
+            resp = await client.request(
+                method=probe_spec.method,
+                url="http://dstack" + probe_spec.url,
+                headers=[(h.name, h.value) for h in probe_spec.headers],
+                data=probe_spec.body,
+                timeout=probe_spec.timeout,
+                follow_redirects=False,
+            )
+            logger.debug("%s: probe status code: %s", fmt(probe), resp.status_code)
+            return resp.is_success
+    except (SSHError, httpx.RequestError) as e:
+        logger.debug("%s: probe failed: %r", fmt(probe), e)
+        return False
+
+
+def _get_probe_async_processing_timeout(probe_spec: ProbeSpec) -> timedelta:
+    return (
+        timedelta(seconds=probe_spec.timeout)
+        + SSH_CONNECT_TIMEOUT
+        + PROCESSING_OVERHEAD_TIMEOUT  # slow db queries and other unforeseen conditions
+    )
+
+
+@asynccontextmanager
+async def _get_service_replica_client(job: JobModel) -> AsyncGenerator[AsyncClient, None]:
+    options = {
+        **SSH_DEFAULT_OPTIONS,
+        "ConnectTimeout": str(int(SSH_CONNECT_TIMEOUT.total_seconds())),
+    }
+    job_spec: JobSpec = JobSpec.__response__.parse_raw(job.job_spec_data)
+    with TemporaryDirectory() as temp_dir:
+        app_socket_path = (Path(temp_dir) / "replica.sock").absolute()
+        async with container_ssh_tunnel(
+            job=job,
+            forwarded_sockets=[
+                SocketPair(
+                    remote=IPSocket("localhost", get_or_error(job_spec.service_port)),
+                    local=UnixSocket(app_socket_path),
+                ),
+            ],
+            options=options,
+        ):
+            async with AsyncClient(
+                transport=AsyncHTTPTransport(uds=str(app_socket_path))
+            ) as client:
+                yield client
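
The transport trick at the heart of `_get_service_replica_client`: the probe's HTTP request travels over a Unix domain socket that is the local end of an SSH tunnel into the service container, so nothing is published on the network and the URL's host part is irrelevant. httpx supports this directly through `AsyncHTTPTransport(uds=...)`. A self-contained reduction, with a placeholder socket path standing in for the tunnel:

```python
import asyncio

import httpx


async def probe_once(socket_path: str) -> bool:
    # Route all requests through the Unix socket instead of TCP; the host in
    # the URL ("dstack" above, "replica" here) is arbitrary.
    transport = httpx.AsyncHTTPTransport(uds=socket_path)
    async with httpx.AsyncClient(transport=transport) as client:
        resp = await client.get("http://replica/health", timeout=10.0)
        return resp.is_success


if __name__ == "__main__":
    # Requires something listening on the socket, e.g. the forwarded service.
    print(asyncio.run(probe_once("/tmp/replica.sock")))
```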

--- a/dstack/_internal/server/background/tasks/process_running_jobs.py
+++ b/dstack/_internal/server/background/tasks/process_running_jobs.py
@@ -42,6 +42,7 @@ from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
     JobModel,
+    ProbeModel,
     ProjectModel,
     RepoModel,
     RunModel,
@@ -414,6 +415,18 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             )
             job_model.status = JobStatus.TERMINATING
             job_model.termination_reason = JobTerminationReason.GATEWAY_ERROR
+        else:
+            for probe_num in range(len(job.job_spec.probes)):
+                session.add(
+                    ProbeModel(
+                        name=f"{job_model.job_name}-{probe_num}",
+                        job=job_model,
+                        probe_num=probe_num,
+                        due=common_utils.get_current_datetime(),
+                        success_streak=0,
+                        active=True,
+                    )
+                )
 
     if job_model.status == JobStatus.RUNNING:
         await _check_gpu_utilization(session, job_model, job)
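
Probes are created here from `job.job_spec.probes` once the job is running; `process_probes.py` then reads `method`, `url`, `headers`, `body`, `timeout`, and `interval` off each `ProbeSpec` (added among the +21 lines in `dstack/_internal/core/models/runs.py`). A sketch of the shape those accesses imply; the types and defaults are guesses, not the shipped schema:

```python
from typing import List, Optional

from pydantic import BaseModel


class ProbeHeader(BaseModel):
    name: str
    value: str


class ProbeSpec(BaseModel):
    method: str = "GET"              # HTTP method used by the probe request
    url: str = "/"                   # path appended to the tunneled base URL
    headers: List[ProbeHeader] = []  # pydantic deep-copies mutable defaults
    body: Optional[str] = None
    timeout: int = 10                # per-request timeout, seconds (assumed default)
    interval: int = 30               # delay between executions, seconds (assumed default)
```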

--- a/dstack/_internal/server/background/tasks/process_runs.py
+++ b/dstack/_internal/server/background/tasks/process_runs.py
@@ -23,6 +23,7 @@ from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
     JobModel,
+    ProbeModel,
     ProjectModel,
     RunModel,
     UserModel,
@@ -36,6 +37,7 @@ from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.prometheus.client_metrics import run_metrics
 from dstack._internal.server.services.runs import (
     fmt,
+    is_replica_ready,
     process_terminating_run,
     retry_run_replica_jobs,
     run_model_to_run,
@@ -149,6 +151,11 @@ async def _process_run(session: AsyncSession, run_model: RunModel):
                .joinedload(JobModel.instance)
                .load_only(InstanceModel.fleet_id)
            )
+            .options(
+                selectinload(RunModel.jobs)
+                .joinedload(JobModel.probes)
+                .load_only(ProbeModel.success_streak)
+            )
            .execution_options(populate_existing=True)
        )
        run_model = res.unique().scalar_one()
@@ -472,22 +479,22 @@ async def _handle_run_replicas(
     )
 
     replicas_to_stop_count = 0
-    # stop any out-of-date replicas that are not running
-    replicas_to_stop_count += len(
-        {
-            j.replica_num
-            for j in run_model.jobs
-            if j.status
-            not in [JobStatus.RUNNING, JobStatus.TERMINATING] + JobStatus.finished_statuses()
-            and j.deployment_num < run_model.deployment_num
-        }
+    # stop any out-of-date replicas that are not ready
+    replicas_to_stop_count += sum(
+        any(j.deployment_num < run_model.deployment_num for j in jobs)
+        and any(
+            j.status not in [JobStatus.TERMINATING] + JobStatus.finished_statuses()
+            for j in jobs
+        )
+        and not is_replica_ready(jobs)
+        for _, jobs in group_jobs_by_replica_latest(run_model.jobs)
     )
-    running_replica_count = len(
-        {j.replica_num for j in run_model.jobs if j.status == JobStatus.RUNNING}
+    ready_replica_count = sum(
+        is_replica_ready(jobs) for _, jobs in group_jobs_by_replica_latest(run_model.jobs)
     )
-    if running_replica_count > run_model.desired_replica_count:
-        # stop excessive running out-of-date replicas
-        replicas_to_stop_count += running_replica_count - run_model.desired_replica_count
+    if ready_replica_count > run_model.desired_replica_count:
+        # stop excessive ready out-of-date replicas
+        replicas_to_stop_count += ready_replica_count - run_model.desired_replica_count
     if replicas_to_stop_count:
         await scale_run_replicas(
             session,
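
The scaling logic above stops counting "running" replicas and starts counting "ready" ones: a replica only counts toward `desired_replica_count` once `is_replica_ready` (added to `dstack/_internal/server/services/runs.py`, +49) approves it, which with this release can factor in probe success streaks. A toy model of that accounting, with simplified stand-ins for the real job and probe fields:

```python
from dataclasses import dataclass, field
from itertools import groupby


@dataclass
class Job:
    replica_num: int
    status: str
    probe_streaks: list = field(default_factory=list)


def is_replica_ready(jobs) -> bool:
    # Stand-in for services.runs.is_replica_ready: every job running and
    # every probe with a positive success streak.
    return all(j.status == "running" for j in jobs) and all(
        streak > 0 for j in jobs for streak in j.probe_streaks
    )


jobs = [
    Job(0, "running", [3]),
    Job(1, "running", [0]),  # probe has not passed yet: replica 1 is not ready
]
grouped = groupby(sorted(jobs, key=lambda j: j.replica_num), key=lambda j: j.replica_num)
print(sum(is_replica_ready(list(g)) for _, g in grouped))  # -> 1
```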

--- /dev/null
+++ b/dstack/_internal/server/migrations/versions/25479f540245_add_probes.py
@@ -0,0 +1,43 @@
+"""Add probes
+
+Revision ID: 25479f540245
+Revises: 50dd7ea98639
+Create Date: 2025-08-03 19:51:07.722217
+
+"""
+
+import sqlalchemy as sa
+import sqlalchemy_utils
+from alembic import op
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "25479f540245"
+down_revision = "50dd7ea98639"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table(
+        "probes",
+        sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
+        sa.Column("name", sa.String(length=100), nullable=False),
+        sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
+        sa.Column("probe_num", sa.Integer(), nullable=False),
+        sa.Column("due", dstack._internal.server.models.NaiveDateTime(), nullable=False),
+        sa.Column("success_streak", sa.BigInteger(), nullable=False),
+        sa.Column("active", sa.Boolean(), nullable=False),
+        sa.ForeignKeyConstraint(["job_id"], ["jobs.id"], name=op.f("fk_probes_job_id_jobs")),
+        sa.PrimaryKeyConstraint("id", "job_id", name=op.f("pk_probes")),
+        sa.UniqueConstraint("job_id", "probe_num", name="uq_probes_job_id_probe_num"),
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table("probes")
+    # ### end Alembic commands ###

--- /dev/null
+++ b/dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py
@@ -0,0 +1,50 @@
+"""Add instance health
+
+Revision ID: 728b1488b1b4
+Revises: 25479f540245
+Create Date: 2025-08-01 14:56:20.466990
+
+"""
+
+import sqlalchemy as sa
+import sqlalchemy_utils
+from alembic import op
+
+import dstack._internal.server.models
+
+# revision identifiers, used by Alembic.
+revision = "728b1488b1b4"
+down_revision = "25479f540245"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.create_table(
+        "instance_health_checks",
+        sa.Column("id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False),
+        sa.Column(
+            "instance_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=False
+        ),
+        sa.Column("collected_at", dstack._internal.server.models.NaiveDateTime(), nullable=False),
+        sa.Column("status", sa.VARCHAR(length=100), nullable=False),
+        sa.Column("response", sa.Text(), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["instance_id"],
+            ["instances.id"],
+            name=op.f("fk_instance_health_checks_instance_id_instances"),
+        ),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_instance_health_checks")),
+    )
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("health", sa.VARCHAR(length=100), nullable=True))
+    op.execute("UPDATE instances SET health = 'HEALTHY'")
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.alter_column("health", existing_type=sa.VARCHAR(length=100), nullable=False)
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.drop_column("health")
+
+    op.drop_table("instance_health_checks")
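
Note the three-step dance in `upgrade()`: `health` is added as a nullable column, backfilled with `'HEALTHY'`, and only then flipped to `NOT NULL`. SQLite cannot add a `NOT NULL` column without a server default to a populated table, and Alembic's `batch_alter_table` performs the copy-and-recreate that SQLite needs for `ALTER COLUMN`, so the same migration runs on both the SQLite and PostgreSQL backends that the dstack server supports.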