dstack 0.19.11rc1__py3-none-any.whl → 0.19.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of dstack has been flagged as potentially problematic.
- dstack/_internal/cli/commands/offer.py +2 -0
- dstack/_internal/cli/services/configurators/run.py +43 -42
- dstack/_internal/cli/utils/run.py +10 -26
- dstack/_internal/cli/utils/updates.py +13 -1
- dstack/_internal/core/backends/aws/compute.py +21 -9
- dstack/_internal/core/backends/base/compute.py +7 -3
- dstack/_internal/core/backends/gcp/compute.py +43 -20
- dstack/_internal/core/backends/gcp/resources.py +18 -2
- dstack/_internal/core/backends/local/compute.py +4 -2
- dstack/_internal/core/backends/template/configurator.py.jinja +1 -6
- dstack/_internal/core/backends/template/models.py.jinja +4 -0
- dstack/_internal/core/models/configurations.py +1 -1
- dstack/_internal/core/models/fleets.py +6 -1
- dstack/_internal/core/models/profiles.py +43 -3
- dstack/_internal/core/models/repos/local.py +19 -13
- dstack/_internal/core/models/runs.py +78 -45
- dstack/_internal/server/background/tasks/process_running_jobs.py +47 -12
- dstack/_internal/server/background/tasks/process_runs.py +14 -1
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +3 -3
- dstack/_internal/server/routers/repos.py +9 -4
- dstack/_internal/server/services/fleets.py +2 -2
- dstack/_internal/server/services/gateways/__init__.py +1 -1
- dstack/_internal/server/services/jobs/__init__.py +4 -4
- dstack/_internal/server/services/plugins.py +64 -32
- dstack/_internal/server/services/runner/client.py +4 -1
- dstack/_internal/server/services/runs.py +2 -2
- dstack/_internal/server/services/volumes.py +1 -1
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-b4803049eac16aea9a49.js → main-b0e80f8e26a168c129e9.js} +72 -25
- dstack/_internal/server/statics/{main-b4803049eac16aea9a49.js.map → main-b0e80f8e26a168c129e9.js.map} +1 -1
- dstack/_internal/server/testing/common.py +2 -1
- dstack/_internal/utils/common.py +4 -0
- dstack/api/server/_fleets.py +5 -1
- dstack/api/server/_runs.py +8 -0
- dstack/plugins/builtin/__init__.py +0 -0
- dstack/plugins/builtin/rest_plugin/__init__.py +18 -0
- dstack/plugins/builtin/rest_plugin/_models.py +48 -0
- dstack/plugins/builtin/rest_plugin/_plugin.py +127 -0
- dstack/version.py +1 -1
- {dstack-0.19.11rc1.dist-info → dstack-0.19.12.dist-info}/METADATA +2 -2
- {dstack-0.19.11rc1.dist-info → dstack-0.19.12.dist-info}/RECORD +44 -41
- dstack/_internal/utils/ignore.py +0 -92
- {dstack-0.19.11rc1.dist-info → dstack-0.19.12.dist-info}/WHEEL +0 -0
- {dstack-0.19.11rc1.dist-info → dstack-0.19.12.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.11rc1.dist-info → dstack-0.19.12.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/core/models/profiles.py

@@ -6,6 +6,7 @@ from typing_extensions import Annotated, Literal
 
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel, Duration
+from dstack._internal.utils.common import list_enum_values_for_annotation
 from dstack._internal.utils.tags import tags_validator
 
 DEFAULT_RETRY_DURATION = 3600
@@ -32,6 +33,17 @@ class TerminationPolicy(str, Enum):
     DESTROY_AFTER_IDLE = "destroy-after-idle"
 
 
+class StartupOrder(str, Enum):
+    ANY = "any"
+    MASTER_FIRST = "master-first"
+    WORKERS_FIRST = "workers-first"
+
+
+class StopCriteria(str, Enum):
+    ALL_DONE = "all-done"
+    MASTER_DONE = "master-done"
+
+
 @overload
 def parse_duration(v: None) -> None: ...
 
@@ -102,7 +114,7 @@ class ProfileRetry(CoreModel):
         Field(
             description=(
                 "The list of events that should be handled with retry."
-                " Supported events are
+                f" Supported events are {list_enum_values_for_annotation(RetryEvent)}."
                 " Omit to retry on all events"
             )
         ),
@@ -190,7 +202,11 @@ class ProfileParams(CoreModel):
     spot_policy: Annotated[
         Optional[SpotPolicy],
         Field(
-            description=
+            description=(
+                "The policy for provisioning spot or on-demand instances:"
+                f" {list_enum_values_for_annotation(SpotPolicy)}."
+                f" Defaults to `{SpotPolicy.ONDEMAND.value}`"
+            )
         ),
     ] = None
     retry: Annotated[
@@ -225,7 +241,11 @@ class ProfileParams(CoreModel):
     creation_policy: Annotated[
         Optional[CreationPolicy],
         Field(
-            description=
+            description=(
+                "The policy for using instances from fleets:"
+                f" {list_enum_values_for_annotation(CreationPolicy)}."
+                f" Defaults to `{CreationPolicy.REUSE_OR_CREATE.value}`"
+            )
        ),
     ] = None
     idle_duration: Annotated[
@@ -241,6 +261,26 @@ class ProfileParams(CoreModel):
         Optional[UtilizationPolicy],
         Field(description="Run termination policy based on utilization"),
     ] = None
+    startup_order: Annotated[
+        Optional[StartupOrder],
+        Field(
+            description=(
+                f"The order in which master and workers jobs are started:"
+                f" {list_enum_values_for_annotation(StartupOrder)}."
+                f" Defaults to `{StartupOrder.ANY.value}`"
+            )
+        ),
+    ] = None
+    stop_criteria: Annotated[
+        Optional[StopCriteria],
+        Field(
+            description=(
+                "The criteria determining when a multi-node run should be considered finished:"
+                f" {list_enum_values_for_annotation(StopCriteria)}."
+                f" Defaults to `{StopCriteria.ALL_DONE.value}`"
+            )
+        ),
+    ] = None
     fleets: Annotated[
         Optional[list[str]], Field(description="The fleets considered for reuse")
     ] = None
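
The new field descriptions are built from the enum values themselves via list_enum_values_for_annotation, added to dstack/_internal/utils/common.py in this release (+4 lines). Below is a minimal sketch of what such a helper presumably produces; the helper body and its exact output format are assumptions, only the call sites above come from the diff.

# A minimal sketch (not the dstack implementation) of a helper like
# list_enum_values_for_annotation: turning an Enum's values into a
# human-readable list for pydantic Field descriptions.
from enum import Enum


class StartupOrder(str, Enum):
    ANY = "any"
    MASTER_FIRST = "master-first"
    WORKERS_FIRST = "workers-first"


def list_enum_values_for_annotation(enum_cls: type[Enum]) -> str:
    # Hypothetical formatting; the real helper lives in dstack/_internal/utils/common.py.
    return ", ".join(f"`{member.value}`" for member in enum_cls)


description = (
    "The order in which master and workers jobs are started:"
    f" {list_enum_values_for_annotation(StartupOrder)}."
    f" Defaults to `{StartupOrder.ANY.value}`"
)
print(description)
# The order in which master and workers jobs are started: `any`, `master-first`, `workers-first`. Defaults to `any`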

dstack/_internal/core/models/repos/local.py

@@ -2,13 +2,18 @@ import tarfile
 from pathlib import Path
 from typing import BinaryIO, Optional
 
+import ignore
+import ignore.overrides
 from typing_extensions import Literal
 
 from dstack._internal.core.models.repos.base import BaseRepoInfo, Repo
+from dstack._internal.utils.common import sizeof_fmt
 from dstack._internal.utils.hash import get_sha256, slugify
-from dstack._internal.utils.
+from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.path import PathLike
 
+logger = get_logger(__name__)
+
 
 class LocalRepoInfo(BaseRepoInfo):
     repo_type: Literal["local"] = "local"
@@ -69,22 +74,23 @@ class LocalRepo(Repo):
         self.run_repo_data = repo_data
 
     def write_code_file(self, fp: BinaryIO) -> str:
+        repo_path = Path(self.run_repo_data.repo_dir)
         with tarfile.TarFile(mode="w", fileobj=fp) as t:
-
-
-
-
-
+            for entry in (
+                ignore.WalkBuilder(repo_path)
+                .overrides(ignore.overrides.OverrideBuilder(repo_path).add("!/.git/").build())
+                .hidden(False)  # do not ignore files that start with a dot
+                .require_git(False)  # respect git ignore rules even if not a git repo
+                .add_custom_ignore_filename(".dstackignore")
+                .build()
+            ):
+                entry_path_within_repo = entry.path().relative_to(repo_path)
+                if entry_path_within_repo != Path("."):
+                    t.add(entry.path(), arcname=entry_path_within_repo, recursive=False)
+        logger.debug("Code file size: %s", sizeof_fmt(fp.tell()))
         return get_sha256(fp)
 
     def get_repo_info(self) -> LocalRepoInfo:
         return LocalRepoInfo(
             repo_dir=self.run_repo_data.repo_dir,
         )
-
-
-class TarIgnore(GitIgnore):
-    def __call__(self, tarinfo: tarfile.TarInfo) -> Optional[tarfile.TarInfo]:
-        if self.ignore(tarinfo.path):
-            return None
-        return tarinfo
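
write_code_file now walks the repo with the ignore package instead of the removed TarIgnore/GitIgnore helpers (dstack/_internal/utils/ignore.py is deleted, -92 lines), so .gitignore rules and a custom .dstackignore file are honored even outside a git repo. Below is a standalone sketch that mirrors the WalkBuilder chain above; it assumes the ignore package exposes exactly the API used in the hunk (WalkBuilder, OverrideBuilder, entry.path()), which is not verified beyond this diff.

# Sketch: pack a directory into a tar archive while honoring ignore rules,
# mirroring the chain used in LocalRepo.write_code_file above.
import tarfile
from pathlib import Path

import ignore
import ignore.overrides


def pack_repo(repo_dir: str, out_path: str) -> None:
    repo_path = Path(repo_dir)
    with tarfile.TarFile(out_path, mode="w") as t:
        for entry in (
            ignore.WalkBuilder(repo_path)
            # keep .git out even though hidden files are otherwise included
            .overrides(ignore.overrides.OverrideBuilder(repo_path).add("!/.git/").build())
            .hidden(False)  # do not skip dotfiles
            .require_git(False)  # honor .gitignore even outside a git repo
            .add_custom_ignore_filename(".dstackignore")
            .build()
        ):
            rel = entry.path().relative_to(repo_path)
            if rel != Path("."):
                t.add(entry.path(), arcname=rel, recursive=False)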

dstack/_internal/core/models/runs.py

@@ -148,9 +148,6 @@ class JobTerminationReason(str, Enum):
         }
         return mapping[self]
 
-    def pretty_repr(self) -> str:
-        return " ".join(self.value.split("_")).capitalize()
-
 
 class Requirements(CoreModel):
     # TODO: Make requirements' fields required
@@ -289,6 +286,9 @@ class JobSubmission(CoreModel):
     exit_status: Optional[int]
     job_provisioning_data: Optional[JobProvisioningData]
     job_runtime_data: Optional[JobRuntimeData]
+    # TODO: make status_message and error a computed field after migrating to pydanticV2
+    status_message: Optional[str]
+    error: Optional[str] = None
 
     @property
     def age(self) -> timedelta:
@@ -301,6 +301,71 @@ class JobSubmission(CoreModel):
             end_time = self.finished_at
         return end_time - self.submitted_at
 
+    @root_validator
+    def _status_message(cls, values) -> Dict:
+        try:
+            status = values["status"]
+            termination_reason = values["termination_reason"]
+            exit_code = values["exit_status"]
+        except KeyError:
+            return values
+        values["status_message"] = JobSubmission._get_status_message(
+            status=status,
+            termination_reason=termination_reason,
+            exit_status=exit_code,
+        )
+        return values
+
+    @staticmethod
+    def _get_status_message(
+        status: JobStatus,
+        termination_reason: Optional[JobTerminationReason],
+        exit_status: Optional[int],
+    ) -> str:
+        if status == JobStatus.DONE:
+            return "exited (0)"
+        elif status == JobStatus.FAILED:
+            if termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
+                return f"exited ({exit_status})"
+            elif termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
+                return "no offers"
+            elif termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
+                return "interrupted"
+            else:
+                return "error"
+        elif status == JobStatus.TERMINATED:
+            if termination_reason == JobTerminationReason.TERMINATED_BY_USER:
+                return "stopped"
+            elif termination_reason == JobTerminationReason.ABORTED_BY_USER:
+                return "aborted"
+        return status.value
+
+    @root_validator
+    def _error(cls, values) -> Dict:
+        try:
+            termination_reason = values["termination_reason"]
+        except KeyError:
+            return values
+        values["error"] = JobSubmission._get_error(termination_reason=termination_reason)
+        return values
+
+    @staticmethod
+    def _get_error(termination_reason: Optional[JobTerminationReason]) -> Optional[str]:
+        error_mapping = {
+            JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
+            JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
+            JobTerminationReason.VOLUME_ERROR: "waiting runner limit exceeded",
+            JobTerminationReason.GATEWAY_ERROR: "gateway error",
+            JobTerminationReason.SCALED_DOWN: "scaled down",
+            JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
+            JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
+            JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed",
+            JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error",
+            JobTerminationReason.EXECUTOR_ERROR: "executor error",
+            JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded",
+        }
+        return error_mapping.get(termination_reason)
+
 
 class Job(CoreModel):
     job_spec: JobSpec
@@ -445,15 +510,20 @@ class Run(CoreModel):
     def _error(cls, values) -> Dict:
         try:
             termination_reason = values["termination_reason"]
-            jobs = values["jobs"]
         except KeyError:
            return values
-        values["error"] =
-            run_termination_reason=termination_reason,
-            run_jobs=jobs,
-        )
+        values["error"] = Run._get_error(termination_reason=termination_reason)
         return values
 
+    @staticmethod
+    def _get_error(termination_reason: Optional[RunTerminationReason]) -> Optional[str]:
+        if termination_reason == RunTerminationReason.RETRY_LIMIT_EXCEEDED:
+            return "retry limit exceeded"
+        elif termination_reason == RunTerminationReason.SERVER_ERROR:
+            return "server error"
+        else:
+            return None
+
 
 class JobPlan(CoreModel):
     job_spec: JobSpec
@@ -502,40 +572,3 @@ def get_policy_map(spot_policy: Optional[SpotPolicy], default: SpotPolicy) -> Op
         SpotPolicy.ONDEMAND: False,
     }
     return policy_map[spot_policy]
-
-
-def _get_run_error(
-    run_termination_reason: Optional[RunTerminationReason],
-    run_jobs: List[Job],
-) -> str:
-    if run_termination_reason is None:
-        return ""
-    if len(run_jobs) > 1:
-        return run_termination_reason.name
-    run_job_termination_reason, exit_status = _get_run_job_termination_reason_and_exit_status(
-        run_jobs
-    )
-    # For failed runs, also show termination reason to provide more context.
-    # For other run statuses, the job termination reason will duplicate run status.
-    if run_job_termination_reason is not None and run_termination_reason in [
-        RunTerminationReason.JOB_FAILED,
-        RunTerminationReason.SERVER_ERROR,
-        RunTerminationReason.RETRY_LIMIT_EXCEEDED,
-    ]:
-        if exit_status:
-            return (
-                f"{run_termination_reason.name}\n({run_job_termination_reason.name} {exit_status})"
-            )
-        return f"{run_termination_reason.name}\n({run_job_termination_reason.name})"
-    return run_termination_reason.name
-
-
-def _get_run_job_termination_reason_and_exit_status(
-    run_jobs: List[Job],
-) -> tuple[Optional[JobTerminationReason], Optional[int]]:
-    for job in run_jobs:
-        if len(job.job_submissions) > 0:
-            job_submission = job.job_submissions[-1]
-            if job_submission.termination_reason is not None:
-                return job_submission.termination_reason, job_submission.exit_status
-    return None, None
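
JobSubmission now precomputes status_message and error with pydantic v1 root_validators, replacing the removed pretty_repr and module-level _get_run_error helpers; the TODO above notes these become computed fields after the pydantic v2 migration. Below is a simplified illustration of the pattern, with made-up field names rather than dstack's models.

# Pydantic v1-style root_validator deriving a display field from other fields.
from typing import Optional

from pydantic import BaseModel, root_validator


class SubmissionView(BaseModel):
    status: str
    exit_status: Optional[int] = None
    status_message: Optional[str] = None

    @root_validator
    def _fill_status_message(cls, values):
        # values holds all parsed fields; write the derived field back into it
        status, exit_status = values.get("status"), values.get("exit_status")
        if status == "done":
            values["status_message"] = "exited (0)"
        elif status == "failed" and exit_status is not None:
            values["status_message"] = f"exited ({exit_status})"
        else:
            values["status_message"] = status
        return values


print(SubmissionView(status="failed", exit_status=137).status_message)  # exited (137)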

dstack/_internal/server/background/tasks/process_running_jobs.py

@@ -18,6 +18,7 @@ from dstack._internal.core.models.instances (
     SSHConnectionParams,
 )
 from dstack._internal.core.models.metrics import Metric
+from dstack._internal.core.models.profiles import StartupOrder
 from dstack._internal.core.models.repos import RemoteRepoCreds
 from dstack._internal.core.models.runs import (
     ClusterInfo,
@@ -184,18 +185,10 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
     if job_provisioning_data.hostname is None:
         await _wait_for_instance_provisioning_data(job_model=job_model)
     else:
-
-
-
-
-                    other_job.job_spec.replica_num == job.job_spec.replica_num
-                    and other_job.job_submissions[-1].status == JobStatus.PROVISIONING
-                    and other_job.job_submissions[-1].job_provisioning_data is not None
-                    and other_job.job_submissions[-1].job_provisioning_data.hostname is None
-                ):
-                    job_model.last_processed_at = common_utils.get_current_datetime()
-                    await session.commit()
-                    return
+        if _should_wait_for_other_nodes(run, job, job_model):
+            job_model.last_processed_at = common_utils.get_current_datetime()
+            await session.commit()
+            return
 
     # fails are acceptable until timeout is exceeded
     if job_provisioning_data.dockerized:
@@ -406,6 +399,48 @@ async def _wait_for_instance_provisioning_data(job_model: JobModel):
     job_model.job_provisioning_data = job_model.instance.job_provisioning_data
 
 
+def _should_wait_for_other_nodes(run: Run, job: Job, job_model: JobModel) -> bool:
+    for other_job in run.jobs:
+        if (
+            other_job.job_spec.replica_num == job.job_spec.replica_num
+            and other_job.job_submissions[-1].status == JobStatus.PROVISIONING
+            and other_job.job_submissions[-1].job_provisioning_data is not None
+            and other_job.job_submissions[-1].job_provisioning_data.hostname is None
+        ):
+            logger.debug(
+                "%s: waiting for other job to have IP assigned",
+                fmt(job_model),
+            )
+            return True
+    master_job = find_job(run.jobs, job.job_spec.replica_num, 0)
+    if (
+        job.job_spec.job_num != 0
+        and run.run_spec.merged_profile.startup_order == StartupOrder.MASTER_FIRST
+        and master_job.job_submissions[-1].status != JobStatus.RUNNING
+    ):
+        logger.debug(
+            "%s: waiting for master job to become running",
+            fmt(job_model),
+        )
+        return True
+    if (
+        job.job_spec.job_num == 0
+        and run.run_spec.merged_profile.startup_order == StartupOrder.WORKERS_FIRST
+    ):
+        for other_job in run.jobs:
+            if (
+                other_job.job_spec.replica_num == job.job_spec.replica_num
+                and other_job.job_spec.job_num != job.job_spec.job_num
+                and other_job.job_submissions[-1].status != JobStatus.RUNNING
+            ):
+                logger.debug(
+                    "%s: waiting for worker job to become running",
+                    fmt(job_model),
+                )
+                return True
+    return False
+
+
 @runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
 def _process_provisioning_with_shim(
     ports: Dict[int, int],
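
The inline wait loop was extracted into _should_wait_for_other_nodes, which also gates job startup on the new startup_order profile setting. Below is a toy model of that ordering decision, using simplified stand-ins for dstack's job and status objects rather than the real models.

from dataclasses import dataclass
from enum import Enum


class StartupOrder(str, Enum):
    ANY = "any"
    MASTER_FIRST = "master-first"
    WORKERS_FIRST = "workers-first"


@dataclass
class JobState:
    job_num: int  # 0 is the master job of the replica
    running: bool


def should_wait(job: JobState, peers: list[JobState], order: StartupOrder) -> bool:
    if order == StartupOrder.MASTER_FIRST and job.job_num != 0:
        # workers hold back until the master job is running
        return not any(p.job_num == 0 and p.running for p in peers)
    if order == StartupOrder.WORKERS_FIRST and job.job_num == 0:
        # the master holds back until every worker job is running
        return any(p.job_num != 0 and not p.running for p in peers)
    return False


peers = [JobState(0, running=False), JobState(1, running=True)]
print(should_wait(JobState(1, running=True), peers, StartupOrder.MASTER_FIRST))  # True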

dstack/_internal/server/background/tasks/process_runs.py

@@ -10,7 +10,7 @@ from sqlalchemy.orm import joinedload, selectinload
 import dstack._internal.server.services.gateways as gateways
 import dstack._internal.server.services.services.autoscalers as autoscalers
 from dstack._internal.core.errors import ServerError
-from dstack._internal.core.models.profiles import RetryEvent
+from dstack._internal.core.models.profiles import RetryEvent, StopCriteria
 from dstack._internal.core.models.runs import (
     Job,
     JobStatus,
@@ -313,6 +313,10 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
             termination_reason = RunTerminationReason.RETRY_LIMIT_EXCEEDED
         else:
             raise ValueError(f"Unexpected termination reason {run_termination_reasons}")
+    elif _should_stop_on_master_done(run):
+        new_status = RunStatus.TERMINATING
+        # ALL_JOBS_DONE is used for all DONE reasons including master-done
+        termination_reason = RunTerminationReason.ALL_JOBS_DONE
     elif RunStatus.RUNNING in run_statuses:
         new_status = RunStatus.RUNNING
     elif RunStatus.PROVISIONING in run_statuses:
@@ -434,3 +438,12 @@ def _can_retry_single_job(run_spec: RunSpec) -> bool:
     # We could make partial retry in some multi-node cases.
     # E.g. restarting a worker node, independent jobs.
     return False
+
+
+def _should_stop_on_master_done(run: Run) -> bool:
+    if run.run_spec.merged_profile.stop_criteria != StopCriteria.MASTER_DONE:
+        return False
+    for job in run.jobs:
+        if job.job_spec.job_num == 0 and job.job_submissions[-1].status == JobStatus.DONE:
+            return True
+    return False

dstack/_internal/server/background/tasks/process_submitted_jobs.py

@@ -659,7 +659,7 @@ async def _attach_volumes(
                 backend=backend,
                 volume_model=volume_model,
                 instance=instance,
-
+                jpd=job_provisioning_data,
             )
             job_runtime_data.volume_names.append(volume.name)
             break  # attach next mount point
@@ -685,7 +685,7 @@ async def _attach_volume(
     backend: Backend,
     volume_model: VolumeModel,
     instance: InstanceModel,
-
+    jpd: JobProvisioningData,
 ):
     compute = backend.compute()
     assert isinstance(compute, ComputeWithVolumeSupport)
@@ -697,7 +697,7 @@ async def _attach_volume(
     attachment_data = await common_utils.run_async(
         compute.attach_volume,
         volume=volume,
-
+        provisioning_data=jpd,
     )
     volume_attachment_model = VolumeAttachmentModel(
         volume=volume_model,

dstack/_internal/server/routers/repos.py

@@ -1,7 +1,6 @@
 from typing import List, Tuple
 
 from fastapi import APIRouter, Depends, Request, UploadFile
-from humanize import naturalsize
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from dstack._internal.core.errors import ResourceNotExistsError, ServerClientError
@@ -20,6 +19,7 @@ from dstack._internal.server.utils.routers import (
     get_base_api_additional_responses,
     get_request_size,
 )
+from dstack._internal.utils.common import sizeof_fmt
 
 router = APIRouter(
     prefix="/api/project/{project_name}/repos",
@@ -98,10 +98,15 @@ async def upload_code(
 ):
     request_size = get_request_size(request)
     if SERVER_CODE_UPLOAD_LIMIT > 0 and request_size > SERVER_CODE_UPLOAD_LIMIT:
+        diff_size_fmt = sizeof_fmt(request_size)
+        limit_fmt = sizeof_fmt(SERVER_CODE_UPLOAD_LIMIT)
+        if diff_size_fmt == limit_fmt:
+            diff_size_fmt = f"{request_size}B"
+            limit_fmt = f"{SERVER_CODE_UPLOAD_LIMIT}B"
         raise ServerClientError(
-            f"Repo diff size is {
-
-
+            f"Repo diff size is {diff_size_fmt}, which exceeds the limit of {limit_fmt}."
+            " Use .gitignore to exclude large files from the repo."
+            " This limit can be modified by setting the DSTACK_SERVER_CODE_UPLOAD_LIMIT environment variable."
         )
     _, project = user_project
     await repos.upload_code(
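
The upload-limit error now formats both sizes with sizeof_fmt and falls back to raw byte counts when the two strings collide, which avoids messages like "2.0MiB exceeds the limit of 2.0MiB". The sizeof_fmt below is a guess at the helper added to dstack/_internal/utils/common.py, based on the well-known recipe; the real implementation may differ.

# Why the fallback exists: nearby sizes can render to the same human-readable string.
def sizeof_fmt(num: float, suffix: str = "B") -> str:
    for unit in ("", "Ki", "Mi", "Gi", "Ti"):
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}{suffix}"
        num /= 1024.0
    return f"{num:.1f}Pi{suffix}"


request_size, limit = 2_097_153, 2_097_152
size_fmt, limit_fmt = sizeof_fmt(request_size), sizeof_fmt(limit)
if size_fmt == limit_fmt:  # same string -> fall back to exact bytes
    size_fmt, limit_fmt = f"{request_size}B", f"{limit}B"
print(f"Repo diff size is {size_fmt}, which exceeds the limit of {limit_fmt}.")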

dstack/_internal/server/services/fleets.py

@@ -237,7 +237,7 @@ async def get_plan(
 ) -> FleetPlan:
     # Spec must be copied by parsing to calculate merged_profile
     effective_spec = FleetSpec.parse_obj(spec.dict())
-    effective_spec = apply_plugin_policies(
+    effective_spec = await apply_plugin_policies(
         user=user.name,
         project=project.name,
         spec=effective_spec,
@@ -342,7 +342,7 @@ async def create_fleet(
     spec: FleetSpec,
 ) -> Fleet:
     # Spec must be copied by parsing to calculate merged_profile
-    spec = apply_plugin_policies(
+    spec = await apply_plugin_policies(
         user=user.name,
         project=project.name,
         spec=spec,

dstack/_internal/server/services/gateways/__init__.py

@@ -140,7 +140,7 @@ async def create_gateway(
     project: ProjectModel,
     configuration: GatewayConfiguration,
 ) -> Gateway:
-    spec = apply_plugin_policies(
+    spec = await apply_plugin_policies(
         user=user.name,
         project=project.name,
         # Create pseudo spec until the gateway API is updated to accept spec

dstack/_internal/server/services/jobs/__init__.py

@@ -470,20 +470,20 @@ async def _detach_volume_from_job_instance(
         await run_async(
             compute.detach_volume,
             volume=volume,
-
+            provisioning_data=jpd,
             force=False,
         )
         # For some backends, the volume may be detached immediately
         detached = await run_async(
             compute.is_volume_detached,
             volume=volume,
-
+            provisioning_data=jpd,
         )
     else:
         detached = await run_async(
             compute.is_volume_detached,
             volume=volume,
-
+            provisioning_data=jpd,
         )
     if not detached and _should_force_detach_volume(job_model, job_spec.stop_duration):
         logger.info(
@@ -494,7 +494,7 @@ async def _detach_volume_from_job_instance(
         await run_async(
             compute.detach_volume,
             volume=volume,
-
+            provisioning_data=jpd,
             force=True,
         )
         # Let the next iteration check if force detach worked

dstack/_internal/server/services/plugins.py

@@ -1,9 +1,11 @@
 import itertools
 from importlib import import_module
+from typing import Dict
 
 from backports.entry_points_selectable import entry_points  # backport for Python 3.9
 
 from dstack._internal.core.errors import ServerClientError
+from dstack._internal.utils.common import run_async
 from dstack._internal.utils.logging import get_logger
 from dstack.plugins import ApplyPolicy, ApplySpec, Plugin
 
@@ -12,59 +14,89 @@ logger = get_logger(__name__)
 
 _PLUGINS: list[Plugin] = []
 
+_BUILTIN_PLUGINS: Dict[str, str] = {"rest_plugin": "dstack.plugins.builtin.rest_plugin:RESTPlugin"}
 
-
-
-
-
-
-
-
-
-
-            )
-            continue
+
+class PluginEntrypoint:
+    def __init__(self, name: str, import_path: str, is_builtin: bool = False):
+        self.name = name
+        self.import_path = import_path
+        self.is_builtin = is_builtin
+
+    def load(self):
+        module_path, _, class_name = self.import_path.partition(":")
         try:
-            module_path, _, class_name = entrypoint.value.partition(":")
             module = import_module(module_path)
+            plugin_class = getattr(module, class_name, None)
+            if plugin_class is None:
+                logger.warning(
+                    ("Failed to load plugin %s: plugin class %s not found in module %s."),
+                    self.name,
+                    class_name,
+                    module_path,
+                )
+                return None
+            if not issubclass(plugin_class, Plugin):
+                logger.warning(
+                    ("Failed to load plugin %s: plugin class %s is not a subclass of Plugin."),
+                    self.name,
+                    class_name,
+                )
+                return None
+            return plugin_class()
         except ImportError:
             logger.warning(
                 (
                     "Failed to load plugin %s when importing %s."
                     " Ensure the module is on the import path."
                 ),
-
-
+                self.name,
+                self.import_path,
             )
-
-
-
-
-
+            return None
+
+
+def load_plugins(enabled_plugins: list[str]):
+    _PLUGINS.clear()
+    entrypoints: dict[str, PluginEntrypoint] = {}
+    plugins_to_load = enabled_plugins.copy()
+    for entrypoint in entry_points(group="dstack.plugins"):
+        if entrypoint.name not in enabled_plugins:
+            logger.info(
+                ("Found not enabled plugin %s. Plugin will not be loaded."),
                 entrypoint.name,
-                class_name,
-                module_path,
             )
             continue
-
-
-
-                entrypoint.name,
-                class_name,
+        else:
+            entrypoints[entrypoint.name] = PluginEntrypoint(
+                entrypoint.name, entrypoint.value, is_builtin=False
             )
-
-
-
-
+
+    for name, import_path in _BUILTIN_PLUGINS.items():
+        if name not in enabled_plugins:
+            logger.info(
+                ("Found not enabled builtin plugin %s. Plugin will not be loaded."),
+                name,
+            )
+        else:
+            entrypoints[name] = PluginEntrypoint(name, import_path, is_builtin=True)
+
+    for plugin_name, plugin_entrypoint in entrypoints.items():
+        plugin_instance = plugin_entrypoint.load()
+        if plugin_instance is not None:
+            _PLUGINS.append(plugin_instance)
+            plugins_to_load.remove(plugin_name)
+            logger.info("Loaded plugin %s", plugin_name)
+
     if plugins_to_load:
         logger.warning("Enabled plugins not found: %s", plugins_to_load)
 
 
-def apply_plugin_policies(user: str, project: str, spec: ApplySpec) -> ApplySpec:
+async def apply_plugin_policies(user: str, project: str, spec: ApplySpec) -> ApplySpec:
     policies = _get_apply_policies()
     for policy in policies:
         try:
-            spec = policy.on_apply
+            spec = await run_async(policy.on_apply, user=user, project=project, spec=spec)
         except ValueError as e:
             msg = None
             if len(e.args) > 0: