dstack 0.18.40rc1__py3-none-any.whl → 0.18.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/apply.py +8 -5
- dstack/_internal/cli/services/configurators/base.py +4 -2
- dstack/_internal/cli/services/configurators/fleet.py +21 -9
- dstack/_internal/cli/services/configurators/gateway.py +15 -0
- dstack/_internal/cli/services/configurators/run.py +6 -5
- dstack/_internal/cli/services/configurators/volume.py +15 -0
- dstack/_internal/cli/services/repos.py +3 -3
- dstack/_internal/cli/utils/fleet.py +44 -33
- dstack/_internal/cli/utils/run.py +27 -7
- dstack/_internal/cli/utils/volume.py +21 -9
- dstack/_internal/core/backends/aws/compute.py +92 -52
- dstack/_internal/core/backends/aws/resources.py +22 -12
- dstack/_internal/core/backends/azure/compute.py +2 -0
- dstack/_internal/core/backends/base/compute.py +20 -2
- dstack/_internal/core/backends/gcp/compute.py +30 -23
- dstack/_internal/core/backends/gcp/resources.py +0 -15
- dstack/_internal/core/backends/oci/compute.py +10 -5
- dstack/_internal/core/backends/oci/resources.py +23 -26
- dstack/_internal/core/backends/remote/provisioning.py +65 -27
- dstack/_internal/core/backends/runpod/compute.py +1 -0
- dstack/_internal/core/models/backends/azure.py +3 -1
- dstack/_internal/core/models/configurations.py +24 -1
- dstack/_internal/core/models/fleets.py +46 -0
- dstack/_internal/core/models/instances.py +5 -1
- dstack/_internal/core/models/pools.py +4 -1
- dstack/_internal/core/models/profiles.py +10 -4
- dstack/_internal/core/models/runs.py +20 -0
- dstack/_internal/core/models/volumes.py +3 -0
- dstack/_internal/core/services/ssh/attach.py +92 -53
- dstack/_internal/core/services/ssh/tunnel.py +58 -31
- dstack/_internal/proxy/gateway/routers/registry.py +2 -0
- dstack/_internal/proxy/gateway/schemas/registry.py +2 -0
- dstack/_internal/proxy/gateway/services/registry.py +4 -0
- dstack/_internal/proxy/lib/models.py +3 -0
- dstack/_internal/proxy/lib/services/service_connection.py +8 -1
- dstack/_internal/server/background/tasks/process_instances.py +72 -33
- dstack/_internal/server/background/tasks/process_metrics.py +9 -9
- dstack/_internal/server/background/tasks/process_running_jobs.py +73 -26
- dstack/_internal/server/background/tasks/process_runs.py +2 -12
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +109 -42
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -1
- dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
- dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
- dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
- dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
- dstack/_internal/server/models.py +10 -4
- dstack/_internal/server/routers/runs.py +1 -0
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/services/backends/configurators/azure.py +34 -8
- dstack/_internal/server/services/config.py +9 -0
- dstack/_internal/server/services/fleets.py +27 -2
- dstack/_internal/server/services/gateways/client.py +9 -1
- dstack/_internal/server/services/jobs/__init__.py +215 -43
- dstack/_internal/server/services/jobs/configurators/base.py +47 -2
- dstack/_internal/server/services/offers.py +91 -5
- dstack/_internal/server/services/pools.py +95 -11
- dstack/_internal/server/services/proxy/repo.py +17 -3
- dstack/_internal/server/services/runner/client.py +1 -1
- dstack/_internal/server/services/runner/ssh.py +33 -5
- dstack/_internal/server/services/runs.py +48 -179
- dstack/_internal/server/services/services/__init__.py +9 -1
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js → main-2ac66bfcbd2e39830b88.js} +30 -31
- dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js.map → main-2ac66bfcbd2e39830b88.js.map} +1 -1
- dstack/_internal/server/statics/{main-fc56d1f4af8e57522a1c.css → main-ad5150a441de98cd8987.css} +1 -1
- dstack/_internal/server/testing/common.py +117 -52
- dstack/_internal/utils/common.py +22 -8
- dstack/_internal/utils/env.py +14 -0
- dstack/_internal/utils/ssh.py +1 -1
- dstack/api/server/_fleets.py +25 -1
- dstack/api/server/_runs.py +23 -2
- dstack/api/server/_volumes.py +12 -1
- dstack/version.py +1 -1
- {dstack-0.18.40rc1.dist-info → dstack-0.18.41.dist-info}/METADATA +1 -1
- {dstack-0.18.40rc1.dist-info → dstack-0.18.41.dist-info}/RECORD +98 -89
- tests/_internal/cli/services/configurators/test_profile.py +3 -3
- tests/_internal/core/services/ssh/test_tunnel.py +56 -4
- tests/_internal/proxy/gateway/routers/test_registry.py +30 -7
- tests/_internal/server/background/tasks/test_process_instances.py +138 -20
- tests/_internal/server/background/tasks/test_process_metrics.py +12 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +192 -0
- tests/_internal/server/background/tasks/test_process_runs.py +27 -3
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +48 -3
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +126 -13
- tests/_internal/server/routers/test_fleets.py +15 -2
- tests/_internal/server/routers/test_pools.py +6 -0
- tests/_internal/server/routers/test_runs.py +27 -0
- tests/_internal/server/services/jobs/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/test_base.py +72 -0
- tests/_internal/server/services/test_pools.py +4 -0
- tests/_internal/server/services/test_runs.py +5 -41
- tests/_internal/utils/test_common.py +21 -0
- tests/_internal/utils/test_env.py +38 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.41.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.41.dist-info}/WHEEL +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.41.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.41.dist-info}/top_level.txt +0 -0
--- a/dstack/_internal/server/background/tasks/process_running_jobs.py
+++ b/dstack/_internal/server/background/tasks/process_running_jobs.py
@@ -10,7 +10,12 @@ from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HT
 from dstack._internal.core.errors import GatewayError
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import NetworkMode, RegistryAuth, is_core_model_instance
-from dstack._internal.core.models.
+from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
+from dstack._internal.core.models.instances import (
+    InstanceStatus,
+    RemoteConnectionInfo,
+    SSHConnectionParams,
+)
 from dstack._internal.core.models.repos import RemoteRepoCreds
 from dstack._internal.core.models.runs import (
     ClusterInfo,
@@ -20,10 +25,12 @@ from dstack._internal.core.models.runs import (
     JobStatus,
     JobTerminationReason,
     Run,
+    RunSpec,
 )
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
+    InstanceModel,
     JobModel,
     ProjectModel,
     RepoModel,
@@ -34,11 +41,13 @@ from dstack._internal.server.services import logs as logs_services
 from dstack._internal.server.services import services
 from dstack._internal.server.services.jobs import (
     find_job,
+    get_job_attached_volumes,
     get_job_runtime_data,
     job_model_to_job_submission,
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
+from dstack._internal.server.services.pools import get_instance_ssh_private_keys
 from dstack._internal.server.services.repos import (
     get_code_model,
     get_repo_creds,
@@ -47,7 +56,6 @@ from dstack._internal.server.services.repos import (
 from dstack._internal.server.services.runner import client
 from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
 from dstack._internal.server.services.runs import (
-    get_job_volumes,
     run_model_to_run,
 )
 from dstack._internal.server.services.storage import get_default_storage
@@ -81,7 +89,7 @@ async def _process_next_running_job():
                 .limit(1)
                 .with_for_update(skip_locked=True)
             )
-            job_model = res.scalar()
+            job_model = res.unique().scalar()
             if job_model is None:
                 return
             lockset.add(job_model.id)
@@ -99,10 +107,10 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
     res = await session.execute(
         select(JobModel)
         .where(JobModel.id == job_model.id)
-        .options(joinedload(JobModel.instance))
+        .options(joinedload(JobModel.instance).joinedload(InstanceModel.project))
         .execution_options(populate_existing=True)
     )
-    job_model = res.scalar_one()
+    job_model = res.unique().scalar_one()
     res = await session.execute(
         select(RunModel)
         .where(RunModel.id == job_model.run_id)
@@ -142,25 +150,17 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         job_provisioning_data=job_provisioning_data,
     )

-    volumes = await
+    volumes = await get_job_attached_volumes(
         session=session,
         project=project,
         run_spec=run.run_spec,
+        job_num=job.job_spec.job_num,
         job_provisioning_data=job_provisioning_data,
     )

-
-
-
-    if (
-        job_model.instance is not None
-        and job_model.instance.remote_connection_info is not None
-        and job_provisioning_data.dockerized
-    ):
-        remote_conn_info: RemoteConnectionInfo = RemoteConnectionInfo.__response__.parse_raw(
-            job_model.instance.remote_connection_info
-        )
-        server_ssh_private_key = remote_conn_info.ssh_keys[0].private
+    server_ssh_private_keys = get_instance_ssh_private_keys(
+        common_utils.get_or_error(job_model.instance)
+    )

     secrets = {}  # TODO secrets

@@ -200,7 +200,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 user_ssh_key = ""
             success = await common_utils.run_async(
                 _process_provisioning_with_shim,
-
+                server_ssh_private_keys,
                 job_provisioning_data,
                 None,
                 run,
@@ -226,7 +226,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             )
             success = await common_utils.run_async(
                 _submit_job_to_runner,
-
+                server_ssh_private_keys,
                 job_provisioning_data,
                 None,
                 run,
@@ -269,7 +269,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             )
             success = await common_utils.run_async(
                 _process_pulling_with_shim,
-
+                server_ssh_private_keys,
                 job_provisioning_data,
                 None,
                 run,
@@ -279,14 +279,14 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 code,
                 secrets,
                 repo_creds,
-
+                server_ssh_private_keys,
                 job_provisioning_data,
             )
         elif initial_status == JobStatus.RUNNING:
             logger.debug("%s: process running job, age=%s", fmt(job_model), job_submission.age)
             success = await common_utils.run_async(
                 _process_running,
-
+                server_ssh_private_keys,
                 job_provisioning_data,
                 job_submission.job_runtime_data,
                 run_model,
@@ -312,8 +312,24 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         and job_model.job_num == 0  # gateway connects only to the first node
         and run.run_spec.configuration.type == "service"
     ):
+        ssh_head_proxy: Optional[SSHConnectionParams] = None
+        ssh_head_proxy_private_key: Optional[str] = None
+        instance = common_utils.get_or_error(job_model.instance)
+        if instance.remote_connection_info is not None:
+            rci = RemoteConnectionInfo.__response__.parse_raw(instance.remote_connection_info)
+            if rci.ssh_proxy is not None:
+                ssh_head_proxy = rci.ssh_proxy
+                ssh_head_proxy_keys = common_utils.get_or_error(rci.ssh_proxy_keys)
+                ssh_head_proxy_private_key = ssh_head_proxy_keys[0].private
         try:
-            await services.register_replica(
+            await services.register_replica(
+                session,
+                run_model.gateway_id,
+                run,
+                job_model,
+                ssh_head_proxy,
+                ssh_head_proxy_private_key,
+            )
         except GatewayError as e:
             logger.warning(
                 "%s: failed to register service replica: %s, age=%s",
@@ -490,7 +506,7 @@ def _process_pulling_with_shim(
     code: bytes,
     secrets: Dict[str, str],
     repo_credentials: Optional[RemoteRepoCreds],
-
+    server_ssh_private_keys: tuple[str, Optional[str]],
     job_provisioning_data: JobProvisioningData,
 ) -> bool:
     """
@@ -555,7 +571,7 @@ def _process_pulling_with_shim(
         return True

     return _submit_job_to_runner(
-
+        server_ssh_private_keys,
         job_provisioning_data,
         job_runtime_data,
         run=run,
@@ -597,6 +613,7 @@ def _process_running(
         runner_logs=resp.runner_logs,
         job_logs=resp.job_logs,
     )
+    previous_status = job_model.status
     if len(resp.job_states) > 0:
         latest_state_event = resp.job_states[-1]
         latest_status = latest_state_event.state
@@ -612,10 +629,40 @@ def _process_running(
             )
         if latest_state_event.termination_message:
             job_model.termination_reason_message = latest_state_event.termination_message
+    else:
+        _terminate_if_inactivity_duration_exceeded(run_model, job_model, resp.no_connections_secs)
+    if job_model.status != previous_status:
         logger.info("%s: now is %s", fmt(job_model), job_model.status.name)
     return True


+def _terminate_if_inactivity_duration_exceeded(
+    run_model: RunModel, job_model: JobModel, no_connections_secs: Optional[int]
+) -> None:
+    conf = RunSpec.__response__.parse_raw(run_model.run_spec).configuration
+    if is_core_model_instance(conf, DevEnvironmentConfiguration) and isinstance(
+        conf.inactivity_duration, int
+    ):
+        logger.debug("%s: no SSH connections for %s seconds", fmt(job_model), no_connections_secs)
+        job_model.inactivity_secs = no_connections_secs
+        if no_connections_secs is None:
+            # TODO(0.19 or earlier): make no_connections_secs required
+            job_model.status = JobStatus.TERMINATING
+            job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
+            job_model.termination_reason_message = (
+                "The selected instance was created before dstack 0.18.41"
+                " and does not support inactivity_duration"
+            )
+        elif no_connections_secs >= conf.inactivity_duration:
+            job_model.status = JobStatus.TERMINATING
+            # TODO(0.19 or earlier): set JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
+            job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+            job_model.termination_reason_message = (
+                f"The job was inactive for {no_connections_secs} seconds,"
+                f" exceeding the inactivity_duration of {conf.inactivity_duration} seconds"
+            )
+
+
 def _get_cluster_info(
     jobs: List[Job],
     replica_num: int,
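For context, the inactivity check introduced above compares the runner-reported no_connections_secs counter against the configuration's inactivity_duration (in seconds) and marks the job TERMINATING once the threshold is reached. The snippet below is a minimal standalone restatement of that rule for illustration only; the helper name and assertions are hypothetical and not part of dstack:

from typing import Optional


def inactivity_termination_reason(
    no_connections_secs: Optional[int], inactivity_duration: Optional[int]
) -> Optional[str]:
    # Illustrative restatement of _terminate_if_inactivity_duration_exceeded above.
    # Returns a termination reason message, or None if the job should keep running.
    if not isinstance(inactivity_duration, int):
        # inactivity_duration is unset: the feature is disabled for this run.
        return None
    if no_connections_secs is None:
        # The runner predates the feature and cannot report the counter.
        return "instance does not support inactivity_duration"
    if no_connections_secs >= inactivity_duration:
        return (
            f"inactive for {no_connections_secs} seconds,"
            f" exceeding inactivity_duration of {inactivity_duration} seconds"
        )
    return None


assert inactivity_termination_reason(7200, 3600) is not None
assert inactivity_termination_reason(120, 3600) is None
assert inactivity_termination_reason(7200, None) is None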
--- a/dstack/_internal/server/background/tasks/process_runs.py
+++ b/dstack/_internal/server/background/tasks/process_runs.py
@@ -230,7 +230,8 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
             # the job is submitted
             replica_statuses.add(RunStatus.SUBMITTED)
         elif job_model.status == JobStatus.FAILED or (
-            job_model.status
+            job_model.status
+            in [JobStatus.TERMINATING, JobStatus.TERMINATED, JobStatus.ABORTED]
             and job_model.termination_reason
             not in {JobTerminationReason.DONE_BY_RUNNER, JobTerminationReason.SCALED_DOWN}
         ):
@@ -244,17 +245,6 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
                 run_termination_reasons.add(RunTerminationReason.RETRY_LIMIT_EXCEEDED)
             else:
                 replica_needs_retry = True
-        elif job_model.status in {
-            JobStatus.TERMINATING,
-            JobStatus.TERMINATED,
-            JobStatus.ABORTED,
-        }:
-            # FIXME: This code does not expect JobStatus.TERMINATED status,
-            # so if a job transitions from RUNNING to TERMINATED,
-            # the run will transition to PENDING instead of TERMINATING.
-            # This may not be observed because process_runs is invoked more frequently
-            # than process_terminating_jobs and because most jobs usually transition to FAILED.
-            pass  # unexpected, but let's ignore it
         else:
             raise ValueError(f"Unexpected job status {job_model.status}")

--- a/dstack/_internal/server/background/tasks/process_submitted_jobs.py
+++ b/dstack/_internal/server/background/tasks/process_submitted_jobs.py
@@ -15,10 +15,7 @@ from dstack._internal.core.models.fleets import (
     FleetStatus,
     InstanceGroupPlacement,
 )
-from dstack._internal.core.models.instances import
-    InstanceOfferWithAvailability,
-    InstanceStatus,
-)
+from dstack._internal.core.models.instances import InstanceOfferWithAvailability, InstanceStatus
 from dstack._internal.core.models.profiles import (
     DEFAULT_POOL_NAME,
     DEFAULT_RUN_TERMINATION_IDLE_TIME,
@@ -26,6 +23,7 @@ from dstack._internal.core.models.profiles import (
     Profile,
     TerminationPolicy,
 )
+from dstack._internal.core.models.resources import Memory
 from dstack._internal.core.models.runs import (
     Job,
     JobProvisioningData,
@@ -52,28 +50,31 @@ from dstack._internal.server.services.fleets import (
     fleet_model_to_fleet,
 )
 from dstack._internal.server.services.jobs import (
+    check_can_attach_job_volumes,
     find_job,
     get_instances_ids_with_detaching_volumes,
+    get_job_configured_volume_models,
+    get_job_configured_volumes,
+    get_job_runtime_data,
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
 from dstack._internal.server.services.offers import get_offers_by_requirements
 from dstack._internal.server.services.pools import (
     filter_pool_instances,
+    get_instance_offer,
     get_instance_provisioning_data,
+    get_shared_pool_instances_with_offers,
 )
 from dstack._internal.server.services.runs import (
-    check_can_attach_run_volumes,
     check_run_spec_requires_instance_mounts,
-    get_offer_volumes,
-    get_run_volume_models,
-    get_run_volumes,
     run_model_to_run,
 )
 from dstack._internal.server.services.volumes import (
     volume_model_to_volume,
 )
 from dstack._internal.utils import common as common_utils
+from dstack._internal.utils import env as env_utils
 from dstack._internal.utils.logging import get_logger

 logger = get_logger(__name__)
@@ -152,17 +153,21 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         await session.commit()
         return
     try:
-        volume_models = await
+        volume_models = await get_job_configured_volume_models(
             session=session,
             project=project,
             run_spec=run_spec,
+            job_num=job.job_spec.job_num,
+            job_spec=job.job_spec,
         )
-        volumes = await
+        volumes = await get_job_configured_volumes(
             session=session,
             project=project,
             run_spec=run_spec,
+            job_num=job.job_spec.job_num,
+            job_spec=job.job_spec,
         )
-
+        check_can_attach_job_volumes(volumes)
     except ServerClientError as e:
         logger.warning("%s: failed to prepare run volumes: %s", fmt(job_model), repr(e))
         job_model.status = JobStatus.TERMINATING
@@ -186,12 +191,12 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             .where(
                 InstanceModel.pool_id == pool.id,
                 InstanceModel.deleted == False,
-                InstanceModel.
+                InstanceModel.total_blocks > InstanceModel.busy_blocks,
             )
-            .options(lazyload(InstanceModel.
+            .options(lazyload(InstanceModel.jobs))
             .with_for_update()
         )
-        pool_instances = list(res.scalars().all())
+        pool_instances = list(res.unique().scalars().all())
         instances_ids = sorted([i.id for i in pool_instances])
         if get_db().dialect_name == "sqlite":
             # Start new transaction to see commited changes after lock
@@ -202,14 +207,16 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
         detaching_instances_ids = await get_instances_ids_with_detaching_volumes(session)
         # Refetch after lock
         res = await session.execute(
-            select(InstanceModel)
+            select(InstanceModel)
+            .where(
                 InstanceModel.id.not_in(detaching_instances_ids),
                 InstanceModel.id.in_(instances_ids),
                 InstanceModel.deleted == False,
-                InstanceModel.
+                InstanceModel.total_blocks > InstanceModel.busy_blocks,
             )
+            .execution_options(populate_existing=True)
         )
-        pool_instances = list(res.scalars().all())
+        pool_instances = list(res.unique().scalars().all())
         instance = await _assign_job_to_pool_instance(
             session=session,
             pool_instances=pool_instances,
@@ -221,8 +228,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             volumes=volumes,
         )
         job_model.instance_assigned = True
-        if instance is not None:
-            job_model.job_runtime_data = _prepare_job_runtime_data(job, instance).json()
         job_model.last_processed_at = common_utils.get_current_datetime()
         await session.commit()
         return
@@ -234,7 +239,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             .options(selectinload(InstanceModel.volumes))
             .execution_options(populate_existing=True)
         )
-        instance = res.scalar_one()
+        instance = res.unique().scalar_one()
         job_model.status = JobStatus.PROVISIONING
     else:
         # Assigned no instance, create a new one
@@ -290,7 +295,7 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):
             offer=offer,
             instance_num=instance_num,
         )
-        job_model.job_runtime_data = _prepare_job_runtime_data(
+        job_model.job_runtime_data = _prepare_job_runtime_data(offer).json()
         instance.fleet_id = fleet_model.id
         logger.info(
             "The job %s created the new instance %s",
@@ -351,21 +356,40 @@ async def _assign_job_to_pool_instance(
     master_job_provisioning_data: Optional[JobProvisioningData] = None,
     volumes: Optional[List[List[Volume]]] = None,
 ) -> Optional[InstanceModel]:
+    instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]]
     profile = run_spec.merged_profile
-
+    multinode = job.job_spec.jobs_per_replica > 1
+    nonshared_instances = filter_pool_instances(
         pool_instances=pool_instances,
         profile=profile,
         requirements=job.job_spec.requirements,
         status=InstanceStatus.IDLE,
         fleet_model=fleet_model,
-        multinode=
+        multinode=multinode,
         master_job_provisioning_data=master_job_provisioning_data,
         volumes=volumes,
+        shared=False,
     )
-
+    instances_with_offers = [
+        (instance, common_utils.get_or_error(get_instance_offer(instance)))
+        for instance in nonshared_instances
+    ]
+    if not multinode:
+        shared_instances_with_offers = get_shared_pool_instances_with_offers(
+            pool_instances=pool_instances,
+            profile=profile,
+            requirements=job.job_spec.requirements,
+            idle_only=True,
+            fleet_model=fleet_model,
+            volumes=volumes,
+        )
+        instances_with_offers.extend(shared_instances_with_offers)
+
+    if len(instances_with_offers) == 0:
         return None
-
-
+
+    instances_with_offers.sort(key=lambda instance_with_offer: instance_with_offer[0].price or 0)
+    instance, offer = instances_with_offers[0]
     # Reload InstanceModel with volumes
     res = await session.execute(
         select(InstanceModel)
@@ -374,7 +398,8 @@ async def _assign_job_to_pool_instance(
     )
     instance = res.unique().scalar_one()
     instance.status = InstanceStatus.BUSY
-    instance.
+    instance.busy_blocks += offer.blocks
+
     logger.info(
         "The job %s switched instance %s status to BUSY",
         job_model.job_name,
@@ -385,8 +410,10 @@ async def _assign_job_to_pool_instance(
         },
     )
     logger.info("%s: now is provisioning on '%s'", fmt(job_model), instance.name)
-    job_model.
+    job_model.instance = instance
     job_model.used_instance_id = instance.id
+    job_model.job_provisioning_data = instance.job_provisioning_data
+    job_model.job_runtime_data = _prepare_job_runtime_data(offer).json()
     return instance


@@ -431,7 +458,7 @@ async def _run_job_on_new_instance(
         offer.region,
         offer.price,
     )
-    offer_volumes =
+    offer_volumes = _get_offer_volumes(volumes, offer)
     try:
         job_provisioning_data = await common_utils.run_async(
             backend.compute().run_job,
@@ -549,29 +576,64 @@ def _create_instance_model_for_job(
         offer=offer.json(),
         termination_policy=termination_policy,
         termination_idle_time=termination_idle_time,
-
+        jobs=[job_model],
         backend=offer.backend,
         price=offer.price,
         region=offer.region,
         volumes=[],
+        total_blocks=1,
+        busy_blocks=1,
     )
     return instance


-def _prepare_job_runtime_data(
-    if
-
-
-
-
-
-
+def _prepare_job_runtime_data(offer: InstanceOfferWithAvailability) -> JobRuntimeData:
+    if offer.total_blocks == 1:
+        if env_utils.get_bool("DSTACK_FORCE_BRIDGE_NETWORK"):
+            network_mode = NetworkMode.BRIDGE
+        else:
+            network_mode = NetworkMode.HOST
+        return JobRuntimeData(
+            network_mode=network_mode,
+            offer=offer,
+        )
+    return JobRuntimeData(
+        network_mode=NetworkMode.BRIDGE,
+        offer=offer,
+        cpu=offer.instance.resources.cpus,
+        gpu=len(offer.instance.resources.gpus),
+        memory=Memory(offer.instance.resources.memory_mib / 1024),
+    )

-    if not is_shared_instance:
-        return JobRuntimeData(network_mode=NetworkMode.HOST)

-
-
+def _get_offer_volumes(
+    volumes: List[List[Volume]],
+    offer: InstanceOfferWithAvailability,
+) -> List[Volume]:
+    """
+    Returns volumes suitable for the offer for each mount point.
+    """
+    offer_volumes = []
+    for mount_point_volumes in volumes:
+        offer_volumes.append(_get_offer_mount_point_volume(mount_point_volumes, offer))
+    return offer_volumes
+
+
+def _get_offer_mount_point_volume(
+    volumes: List[Volume],
+    offer: InstanceOfferWithAvailability,
+) -> Volume:
+    """
+    Returns the first suitable volume for the offer among possible mount point volumes.
+    """
+    for volume in volumes:
+        if (
+            volume.configuration.backend != offer.backend
+            or volume.configuration.region != offer.region
+        ):
+            continue
+        return volume
+    raise ServerClientError("Failed to find an eligible volume for the mount point")


 async def _attach_volumes(
@@ -586,6 +648,8 @@ async def _attach_volumes(
         project=project,
         backend_type=job_provisioning_data.backend,
     )
+    job_runtime_data = common_utils.get_or_error(get_job_runtime_data(job_model))
+    job_runtime_data.volume_names = []
     logger.info("Attaching volumes: %s", [[v.name for v in vs] for vs in volume_models])
     for mount_point_volume_models in volume_models:
         for volume_model in mount_point_volume_models:
@@ -604,6 +668,7 @@ async def _attach_volumes(
                     instance=instance,
                     instance_id=job_provisioning_data.instance_id,
                 )
+                job_runtime_data.volume_names.append(volume.name)
                 break  # attach next mount point
             except (ServerClientError, BackendError) as e:
                 logger.warning("%s: failed to attached volume: %s", fmt(job_model), repr(e))
@@ -620,6 +685,8 @@ async def _attach_volumes(
                 # TODO: Replace with JobTerminationReason.VOLUME_ERROR in 0.19
                 job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
                 job_model.termination_reason_message = "Failed to attach volume"
+            finally:
+                job_model.job_runtime_data = job_runtime_data.json()


 async def _attach_volume(
--- a/dstack/_internal/server/background/tasks/process_terminating_jobs.py
+++ b/dstack/_internal/server/background/tasks/process_terminating_jobs.py
@@ -52,7 +52,7 @@ async def _process_next_terminating_job():
                     InstanceModel.id == job_model.used_instance_id,
                     InstanceModel.id.not_in(instance_lockset),
                 )
-                .options(lazyload(InstanceModel.
+                .options(lazyload(InstanceModel.jobs))
                 .with_for_update(skip_locked=True)
             )
             instance_model = res.scalar()
--- /dev/null
+++ b/dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py
@@ -0,0 +1,71 @@
+"""Reverse Job-Instance relationship
+
+Revision ID: 1338b788b612
+Revises: 51d45659d574
+Create Date: 2025-01-16 14:59:19.113534
+
+"""
+
+import sqlalchemy as sa
+import sqlalchemy_utils
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "1338b788b612"
+down_revision = "51d45659d574"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column(
+                "instance_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True
+            )
+        )
+        batch_op.create_foreign_key(
+            batch_op.f("fk_jobs_instance_id_instances"),
+            "instances",
+            ["instance_id"],
+            ["id"],
+            ondelete="CASCADE",
+        )
+
+    op.execute("""
+        UPDATE jobs AS j
+        SET instance_id = (
+            SELECT i.id
+            FROM instances AS i
+            WHERE i.job_id = j.id
+        )
+    """)
+
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.drop_constraint("fk_instances_job_id_jobs", type_="foreignkey")
+        batch_op.drop_column("job_id")
+
+
+def downgrade() -> None:
+    with op.batch_alter_table("instances", schema=None) as batch_op:
+        batch_op.add_column(
+            sa.Column("job_id", sqlalchemy_utils.types.uuid.UUIDType(binary=False), nullable=True)
+        )
+        batch_op.create_foreign_key("fk_instances_job_id_jobs", "jobs", ["job_id"], ["id"])
+
+    # This migration is not fully reversible - we cannot assign multiple jobs to a single instance,
+    # thus LIMIT 1
+    op.execute("""
+        UPDATE instances AS i
+        SET job_id = (
+            SELECT j.id
+            FROM jobs j
+            WHERE j.instance_id = i.id
+            ORDER by j.submitted_at DESC
+            LIMIT 1
+        )
+    """)
+
+    with op.batch_alter_table("jobs", schema=None) as batch_op:
+        batch_op.drop_constraint(batch_op.f("fk_jobs_instance_id_instances"), type_="foreignkey")
+        batch_op.drop_column("instance_id")