PyPI - dstack - Versions diffs - 0.18.40rc1__py3-none-any.whl → 0.18.42__py3-none-any.whl - Mend

dstack 0.18.40rc1__py3-none-any.whl → 0.18.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (104) hide show

dstack/_internal/server/background/tasks/process_metrics.py CHANGED Viewed

@@ -3,18 +3,19 @@ import json
 from typing import Dict, List, Optional
 from sqlalchemy import delete, select
-from sqlalchemy.orm import selectinload
+from sqlalchemy.orm import joinedload
 from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT
 from dstack._internal.core.models.runs import JobStatus
 from dstack._internal.server import settings
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import JobMetricsPoint, JobModel
+from dstack._internal.server.models import InstanceModel, JobMetricsPoint, JobModel
 from dstack._internal.server.schemas.runner import MetricsResponse
 from dstack._internal.server.services.jobs import get_job_provisioning_data, get_job_runtime_data
+from dstack._internal.server.services.pools import get_instance_ssh_private_keys
 from dstack._internal.server.services.runner import client
 from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
-from dstack._internal.utils.common import batched, get_current_datetime, run_async
+from dstack._internal.utils.common import batched, get_current_datetime, get_or_error, run_async
 from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
@@ -29,14 +30,12 @@ async def collect_metrics():
     async with get_session_ctx() as session:
         res = await session.execute(
             select(JobModel)
-            .where(
-                JobModel.status.in_([JobStatus.RUNNING]),
-            )
-            .options(selectinload(JobModel.project))
+            .where(JobModel.status.in_([JobStatus.RUNNING]))
+            .options(joinedload(JobModel.instance).joinedload(InstanceModel.project))
             .order_by(JobModel.last_processed_at.asc())
             .limit(MAX_JOBS_FETCHED)
         )
-        job_models = res.scalars().all()
+        job_models = res.unique().scalars().all()
     for batch in batched(job_models, BATCH_SIZE):
         await _collect_jobs_metrics(batch)
@@ -87,6 +86,7 @@ def _get_recently_collected_metric_cutoff() -> int:
 async def _collect_job_metrics(job_model: JobModel) -> Optional[JobMetricsPoint]:
+    ssh_private_keys = get_instance_ssh_private_keys(get_or_error(job_model.instance))
     jpd = get_job_provisioning_data(job_model)
     jrd = get_job_runtime_data(job_model)
     if jpd is None:
@@ -94,7 +94,7 @@ async def _collect_job_metrics(job_model: JobModel) -> Optional[JobMetricsPoint]
     try:
         res = await run_async(
             _pull_runner_metrics,
-            job_model.project.ssh_private_key,
+            ssh_private_keys,
             jpd,
             jrd,
         )

dstack/_internal/server/background/tasks/process_running_jobs.py CHANGED Viewed

@@ -10,7 +10,12 @@ from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT, DSTACK_SHIM_HT
 from dstack._internal.core.errors import GatewayError
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import NetworkMode, RegistryAuth, is_core_model_instance
-from dstack._internal.core.models.instances import InstanceStatus, RemoteConnectionInfo
+from dstack._internal.core.models.configurations import DevEnvironmentConfiguration
+from dstack._internal.core.models.instances import (
+    InstanceStatus,
+    RemoteConnectionInfo,
+    SSHConnectionParams,
+)
 from dstack._internal.core.models.repos import RemoteRepoCreds
 from dstack._internal.core.models.runs import (
     ClusterInfo,
@@ -20,10 +25,12 @@ from dstack._internal.core.models.runs import (
     JobStatus,
     JobTerminationReason,
     Run,
+    RunSpec,
 )
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
+    InstanceModel,
     JobModel,
     ProjectModel,
     RepoModel,
@@ -34,11 +41,13 @@ from dstack._internal.server.services import logs as logs_services
 from dstack._internal.server.services import services
 from dstack._internal.server.services.jobs import (
     find_job,
+    get_job_attached_volumes,
     get_job_runtime_data,
     job_model_to_job_submission,
 )
 from dstack._internal.server.services.locking import get_locker
 from dstack._internal.server.services.logging import fmt
+from dstack._internal.server.services.pools import get_instance_ssh_private_keys
 from dstack._internal.server.services.repos import (
     get_code_model,
     get_repo_creds,
@@ -47,7 +56,6 @@ from dstack._internal.server.services.repos import (
 from dstack._internal.server.services.runner import client
 from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
 from dstack._internal.server.services.runs import (
-    get_job_volumes,
     run_model_to_run,
 )
 from dstack._internal.server.services.storage import get_default_storage
@@ -81,7 +89,7 @@ async def _process_next_running_job():
                 .limit(1)
                 .with_for_update(skip_locked=True)
             )
-            job_model = res.scalar()
+            job_model = res.unique().scalar()
             if job_model is None:
                 return
             lockset.add(job_model.id)
@@ -99,10 +107,10 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
     res = await session.execute(
         select(JobModel)
         .where(JobModel.id == job_model.id)
-        .options(joinedload(JobModel.instance))
+        .options(joinedload(JobModel.instance).joinedload(InstanceModel.project))
         .execution_options(populate_existing=True)
     )
-    job_model = res.scalar_one()
+    job_model = res.unique().scalar_one()
     res = await session.execute(
         select(RunModel)
         .where(RunModel.id == job_model.run_id)
@@ -142,25 +150,17 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         job_provisioning_data=job_provisioning_data,
     )
-    volumes = await get_job_volumes(
+    volumes = await get_job_attached_volumes(
         session=session,
         project=project,
         run_spec=run.run_spec,
+        job_num=job.job_spec.job_num,
         job_provisioning_data=job_provisioning_data,
     )
-    server_ssh_private_key = project.ssh_private_key
-    # TODO: Drop this logic and always use project key once it's safe to assume that most on-prem
-    # fleets are (re)created after this change: https://github.com/dstackai/dstack/pull/1716
-    if (
-        job_model.instance is not None
-        and job_model.instance.remote_connection_info is not None
-        and job_provisioning_data.dockerized
-    ):
-        remote_conn_info: RemoteConnectionInfo = RemoteConnectionInfo.__response__.parse_raw(
-            job_model.instance.remote_connection_info
-        )
-        server_ssh_private_key = remote_conn_info.ssh_keys[0].private
+    server_ssh_private_keys = get_instance_ssh_private_keys(
+        common_utils.get_or_error(job_model.instance)
+    )
     secrets = {}  # TODO secrets
@@ -200,11 +200,12 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                     user_ssh_key = ""
                 success = await common_utils.run_async(
                     _process_provisioning_with_shim,
-                    server_ssh_private_key,
+                    server_ssh_private_keys,
                     job_provisioning_data,
                     None,
                     run,
                     job_model,
+                    job_provisioning_data,
                     volumes,
                     secrets,
                     job.job_spec.registry_auth,
@@ -226,7 +227,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 )
                 success = await common_utils.run_async(
                     _submit_job_to_runner,
-                    server_ssh_private_key,
+                    server_ssh_private_keys,
                     job_provisioning_data,
                     None,
                     run,
@@ -269,7 +270,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
             )
             success = await common_utils.run_async(
                 _process_pulling_with_shim,
-                server_ssh_private_key,
+                server_ssh_private_keys,
                 job_provisioning_data,
                 None,
                 run,
@@ -279,14 +280,14 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 code,
                 secrets,
                 repo_creds,
-                server_ssh_private_key,
+                server_ssh_private_keys,
                 job_provisioning_data,
             )
         elif initial_status == JobStatus.RUNNING:
             logger.debug("%s: process running job, age=%s", fmt(job_model), job_submission.age)
             success = await common_utils.run_async(
                 _process_running,
-                server_ssh_private_key,
+                server_ssh_private_keys,
                 job_provisioning_data,
                 job_submission.job_runtime_data,
                 run_model,
@@ -312,8 +313,24 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         and job_model.job_num == 0  # gateway connects only to the first node
         and run.run_spec.configuration.type == "service"
     ):
+        ssh_head_proxy: Optional[SSHConnectionParams] = None
+        ssh_head_proxy_private_key: Optional[str] = None
+        instance = common_utils.get_or_error(job_model.instance)
+        if instance.remote_connection_info is not None:
+            rci = RemoteConnectionInfo.__response__.parse_raw(instance.remote_connection_info)
+            if rci.ssh_proxy is not None:
+                ssh_head_proxy = rci.ssh_proxy
+                ssh_head_proxy_keys = common_utils.get_or_error(rci.ssh_proxy_keys)
+                ssh_head_proxy_private_key = ssh_head_proxy_keys[0].private
         try:
-            await services.register_replica(session, run_model.gateway_id, run, job_model)
+            await services.register_replica(
+                session,
+                run_model.gateway_id,
+                run,
+                job_model,
+                ssh_head_proxy,
+                ssh_head_proxy_private_key,
+            )
         except GatewayError as e:
             logger.warning(
                 "%s: failed to register service replica: %s, age=%s",
@@ -360,6 +377,7 @@ def _process_provisioning_with_shim(
     ports: Dict[int, int],
     run: Run,
     job_model: JobModel,
+    job_provisioning_data: JobProvisioningData,
     volumes: List[Volume],
     secrets: Dict[str, str],
     registry_auth: Optional[RegistryAuth],
@@ -443,6 +461,7 @@ def _process_provisioning_with_shim(
             host_ssh_user=ssh_user,
             host_ssh_keys=[ssh_key] if ssh_key else [],
             container_ssh_keys=public_keys,
+            instance_id=job_provisioning_data.instance_id,
         )
     else:
         submitted = shim_client.submit(
@@ -459,6 +478,7 @@ def _process_provisioning_with_shim(
             mounts=volume_mounts,
             volumes=volumes,
             instance_mounts=instance_mounts,
+            instance_id=job_provisioning_data.instance_id,
         )
         if not submitted:
             # This can happen when we lost connection to the runner (e.g., network issues), marked
@@ -490,7 +510,7 @@ def _process_pulling_with_shim(
     code: bytes,
     secrets: Dict[str, str],
     repo_credentials: Optional[RemoteRepoCreds],
-    server_ssh_private_key: str,
+    server_ssh_private_keys: tuple[str, Optional[str]],
     job_provisioning_data: JobProvisioningData,
 ) -> bool:
     """
@@ -555,7 +575,7 @@ def _process_pulling_with_shim(
             return True
     return _submit_job_to_runner(
-        server_ssh_private_key,
+        server_ssh_private_keys,
         job_provisioning_data,
         job_runtime_data,
         run=run,
@@ -597,6 +617,7 @@ def _process_running(
         runner_logs=resp.runner_logs,
         job_logs=resp.job_logs,
     )
+    previous_status = job_model.status
     if len(resp.job_states) > 0:
         latest_state_event = resp.job_states[-1]
         latest_status = latest_state_event.state
@@ -612,10 +633,40 @@ def _process_running(
                 )
             if latest_state_event.termination_message:
                 job_model.termination_reason_message = latest_state_event.termination_message
+    else:
+        _terminate_if_inactivity_duration_exceeded(run_model, job_model, resp.no_connections_secs)
+    if job_model.status != previous_status:
         logger.info("%s: now is %s", fmt(job_model), job_model.status.name)
     return True
+def _terminate_if_inactivity_duration_exceeded(
+    run_model: RunModel, job_model: JobModel, no_connections_secs: Optional[int]
+) -> None:
+    conf = RunSpec.__response__.parse_raw(run_model.run_spec).configuration
+    if is_core_model_instance(conf, DevEnvironmentConfiguration) and isinstance(
+        conf.inactivity_duration, int
+    ):
+        logger.debug("%s: no SSH connections for %s seconds", fmt(job_model), no_connections_secs)
+        job_model.inactivity_secs = no_connections_secs
+        if no_connections_secs is None:
+            # TODO(0.19 or earlier): make no_connections_secs required
+            job_model.status = JobStatus.TERMINATING
+            job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
+            job_model.termination_reason_message = (
+                "The selected instance was created before dstack 0.18.41"
+                " and does not support inactivity_duration"
+            )
+        elif no_connections_secs >= conf.inactivity_duration:
+            job_model.status = JobStatus.TERMINATING
+            # TODO(0.19 or earlier): set JobTerminationReason.INACTIVITY_DURATION_EXCEEDED
+            job_model.termination_reason = JobTerminationReason.TERMINATED_BY_SERVER
+            job_model.termination_reason_message = (
+                f"The job was inactive for {no_connections_secs} seconds,"
+                f" exceeding the inactivity_duration of {conf.inactivity_duration} seconds"
+            )
 def _get_cluster_info(
     jobs: List[Job],
     replica_num: int,

dstack/_internal/server/background/tasks/process_runs.py CHANGED Viewed

@@ -230,7 +230,8 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
                 # the job is submitted
                 replica_statuses.add(RunStatus.SUBMITTED)
             elif job_model.status == JobStatus.FAILED or (
-                job_model.status == JobStatus.TERMINATING
+                job_model.status
+                in [JobStatus.TERMINATING, JobStatus.TERMINATED, JobStatus.ABORTED]
                 and job_model.termination_reason
                 not in {JobTerminationReason.DONE_BY_RUNNER, JobTerminationReason.SCALED_DOWN}
             ):
@@ -244,17 +245,6 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
                         run_termination_reasons.add(RunTerminationReason.RETRY_LIMIT_EXCEEDED)
                     else:
                         replica_needs_retry = True
-            elif job_model.status in {
-                JobStatus.TERMINATING,
-                JobStatus.TERMINATED,
-                JobStatus.ABORTED,
-            }:
-                # FIXME: This code does not expect JobStatus.TERMINATED status,
-                # so if a job transitions from RUNNING to TERMINATED,
-                # the run will transition to PENDING instead of TERMINATING.
-                # This may not be observed because process_runs is invoked more frequently
-                # than process_terminating_jobs and because most jobs usually transition to FAILED.
-                pass  # unexpected, but let's ignore it
             else:
                 raise ValueError(f"Unexpected job status {job_model.status}")

dstack 0.18.40rc1__py3-none-any.whl → 0.18.42__py3-none-any.whl

dstack 0.18.40rc1__py3-none-any.whl → 0.18.42__py3-none-any.whl