dstack 0.19.12rc1__py3-none-any.whl → 0.19.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/services/configurators/run.py +43 -47
- dstack/_internal/cli/utils/run.py +15 -27
- dstack/_internal/core/backends/aws/compute.py +22 -9
- dstack/_internal/core/backends/aws/resources.py +26 -0
- dstack/_internal/core/backends/base/offers.py +0 -1
- dstack/_internal/core/backends/template/configurator.py.jinja +1 -6
- dstack/_internal/core/backends/template/models.py.jinja +4 -0
- dstack/_internal/core/compatibility/__init__.py +0 -0
- dstack/_internal/core/compatibility/fleets.py +72 -0
- dstack/_internal/core/compatibility/gateways.py +34 -0
- dstack/_internal/core/compatibility/runs.py +125 -0
- dstack/_internal/core/compatibility/volumes.py +32 -0
- dstack/_internal/core/models/configurations.py +1 -1
- dstack/_internal/core/models/fleets.py +6 -1
- dstack/_internal/core/models/instances.py +51 -12
- dstack/_internal/core/models/profiles.py +43 -3
- dstack/_internal/core/models/repos/local.py +3 -3
- dstack/_internal/core/models/runs.py +118 -44
- dstack/_internal/server/app.py +1 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +47 -12
- dstack/_internal/server/background/tasks/process_runs.py +14 -1
- dstack/_internal/server/services/runner/client.py +4 -1
- dstack/_internal/server/services/storage/__init__.py +38 -0
- dstack/_internal/server/services/storage/base.py +27 -0
- dstack/_internal/server/services/storage/gcs.py +44 -0
- dstack/_internal/server/services/{storage.py → storage/s3.py} +4 -27
- dstack/_internal/server/settings.py +7 -3
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-5b9786c955b42bf93581.js → main-2066f1f22ddb4557bcde.js} +1677 -46
- dstack/_internal/server/statics/{main-5b9786c955b42bf93581.js.map → main-2066f1f22ddb4557bcde.js.map} +1 -1
- dstack/_internal/server/statics/{main-8f9c66f404e9c7e7e020.css → main-f39c418b05fe14772dd8.css} +1 -1
- dstack/_internal/server/testing/common.py +2 -1
- dstack/_internal/utils/common.py +4 -0
- dstack/api/server/_fleets.py +9 -69
- dstack/api/server/_gateways.py +3 -14
- dstack/api/server/_runs.py +4 -116
- dstack/api/server/_volumes.py +3 -14
- dstack/plugins/builtin/rest_plugin/_plugin.py +24 -5
- dstack/version.py +2 -2
- {dstack-0.19.12rc1.dist-info → dstack-0.19.13.dist-info}/METADATA +1 -1
- {dstack-0.19.12rc1.dist-info → dstack-0.19.13.dist-info}/RECORD +44 -36
- {dstack-0.19.12rc1.dist-info → dstack-0.19.13.dist-info}/WHEEL +0 -0
- {dstack-0.19.12rc1.dist-info → dstack-0.19.13.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.12rc1.dist-info → dstack-0.19.13.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/models/configurations.py
CHANGED
@@ -440,7 +440,7 @@ class ServiceConfigurationParams(CoreModel):
             raise ValueError("The minimum number of replicas must be greater than or equal to 0")
         if v.max < v.min:
             raise ValueError(
-                "The maximum number of replicas must be greater than or equal to the
+                "The maximum number of replicas must be greater than or equal to the minimum number of replicas"
             )
         return v
 
dstack/_internal/core/models/fleets.py
CHANGED
@@ -20,6 +20,7 @@ from dstack._internal.core.models.profiles import (
     parse_idle_duration,
 )
 from dstack._internal.core.models.resources import Range, ResourcesSpec
+from dstack._internal.utils.common import list_enum_values_for_annotation
 from dstack._internal.utils.json_schema import add_extra_schema_types
 from dstack._internal.utils.tags import tags_validator
 
@@ -207,7 +208,11 @@ class InstanceGroupParams(CoreModel):
     spot_policy: Annotated[
         Optional[SpotPolicy],
         Field(
-            description=
+            description=(
+                "The policy for provisioning spot or on-demand instances:"
+                f" {list_enum_values_for_annotation(SpotPolicy)}."
+                f" Defaults to `{SpotPolicy.ONDEMAND.value}`"
+            )
         ),
     ] = None
     retry: Annotated[
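The enum values interpolated into these descriptions come from `list_enum_values_for_annotation`, added to `dstack/_internal/utils/common.py` in this release (+4 lines) but not shown in the diff. A plausible sketch, assuming it simply joins the members' values as backtick-quoted literals (the `SpotPolicy` members below are taken from dstack's documented `spot`/`on-demand`/`auto` values):

```python
from enum import Enum
from typing import Type


def list_enum_values_for_annotation(enum_class: Type[Enum]) -> str:
    # Hypothetical reconstruction: render each member's value as `value`,
    # comma-separated, for embedding in a Field description.
    return ", ".join(f"`{member.value}`" for member in enum_class)


class SpotPolicy(str, Enum):
    SPOT = "spot"
    ONDEMAND = "on-demand"
    AUTO = "auto"


print(list_enum_values_for_annotation(SpotPolicy))
# `spot`, `on-demand`, `auto`
```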
dstack/_internal/core/models/instances.py
CHANGED
@@ -48,29 +48,68 @@ class Resources(CoreModel):
     gpus: List[Gpu]
     spot: bool
     disk: Disk = Disk(size_mib=102400)  # the default value (100GB) for backward compatibility
+    # TODO: make description a computed field after migrating to pydanticV2
     description: str = ""
     cpu_arch: Optional[gpuhunt.CPUArchitecture] = None
 
-
+    @root_validator
+    def _description(cls, values) -> Dict:
+        try:
+            description = values["description"]
+            if not description:
+                cpus = values["cpus"]
+                memory_mib = values["memory_mib"]
+                gpus = values["gpus"]
+                disk_size_mib = values["disk"].size_mib
+                spot = values["spot"]
+                cpu_arch = values["cpu_arch"]
+                values["description"] = Resources._pretty_format(
+                    cpus, cpu_arch, memory_mib, disk_size_mib, gpus, spot, include_spot=True
+                )
+        except KeyError:
+            return values
+        return values
+
+    @staticmethod
+    def _pretty_format(
+        cpus: int,
+        cpu_arch: Optional[gpuhunt.CPUArchitecture],
+        memory_mib: int,
+        disk_size_mib: int,
+        gpus: List[Gpu],
+        spot: bool,
+        include_spot: bool = False,
+    ) -> str:
         resources = {}
-        if
-            resources["cpus"] =
-            resources["cpu_arch"] =
-        if
-            resources["memory"] = f"{
-        if
-            resources["disk_size"] = f"{
-        if
-            gpu =
+        if cpus > 0:
+            resources["cpus"] = cpus
+            resources["cpu_arch"] = cpu_arch
+        if memory_mib > 0:
+            resources["memory"] = f"{memory_mib / 1024:.0f}GB"
+        if disk_size_mib > 0:
+            resources["disk_size"] = f"{disk_size_mib / 1024:.0f}GB"
+        if gpus:
+            gpu = gpus[0]
             resources["gpu_name"] = gpu.name
-            resources["gpu_count"] = len(
+            resources["gpu_count"] = len(gpus)
             if gpu.memory_mib > 0:
                 resources["gpu_memory"] = f"{gpu.memory_mib / 1024:.0f}GB"
         output = pretty_resources(**resources)
-        if include_spot and
+        if include_spot and spot:
             output += " (spot)"
         return output
 
+    def pretty_format(self, include_spot: bool = False) -> str:
+        return Resources._pretty_format(
+            self.cpus,
+            self.cpu_arch,
+            self.memory_mib,
+            self.disk.size_mib,
+            self.gpus,
+            self.spot,
+            include_spot,
+        )
+
 
 class InstanceType(CoreModel):
     name: str
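The `_description` validator above is the pydantic v1 workaround the TODO alludes to: v1 has no computed fields, so a bare `@root_validator` fills a declared placeholder field after the other fields parse, and the `KeyError` pass-through keeps it quiet when earlier validation already failed. A minimal standalone sketch of the pattern, assuming pydantic v1 (which the pydanticV2 TODOs imply dstack still pins):

```python
from typing import Dict

from pydantic import BaseModel, root_validator  # pydantic v1 API


class Box(BaseModel):
    width: int
    height: int
    description: str = ""  # placeholder, filled in by the validator below

    @root_validator
    def _description(cls, values) -> Dict:
        try:
            if not values["description"]:
                # Derive the field from the already-validated values.
                values["description"] = f"{values['width']}x{values['height']}"
        except KeyError:
            # A field failed validation and is absent; leave values untouched.
            return values
        return values


print(Box(width=3, height=4).description)  # 3x4
```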
dstack/_internal/core/models/profiles.py
CHANGED
@@ -6,6 +6,7 @@ from typing_extensions import Annotated, Literal
 
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel, Duration
+from dstack._internal.utils.common import list_enum_values_for_annotation
 from dstack._internal.utils.tags import tags_validator
 
 DEFAULT_RETRY_DURATION = 3600
@@ -32,6 +33,17 @@ class TerminationPolicy(str, Enum):
     DESTROY_AFTER_IDLE = "destroy-after-idle"
 
 
+class StartupOrder(str, Enum):
+    ANY = "any"
+    MASTER_FIRST = "master-first"
+    WORKERS_FIRST = "workers-first"
+
+
+class StopCriteria(str, Enum):
+    ALL_DONE = "all-done"
+    MASTER_DONE = "master-done"
+
+
 @overload
 def parse_duration(v: None) -> None: ...
 
@@ -102,7 +114,7 @@ class ProfileRetry(CoreModel):
         Field(
             description=(
                 "The list of events that should be handled with retry."
-                " Supported events are
+                f" Supported events are {list_enum_values_for_annotation(RetryEvent)}."
                 " Omit to retry on all events"
             )
         ),
@@ -190,7 +202,11 @@ class ProfileParams(CoreModel):
     spot_policy: Annotated[
         Optional[SpotPolicy],
         Field(
-            description=
+            description=(
+                "The policy for provisioning spot or on-demand instances:"
+                f" {list_enum_values_for_annotation(SpotPolicy)}."
+                f" Defaults to `{SpotPolicy.ONDEMAND.value}`"
+            )
         ),
     ] = None
     retry: Annotated[
@@ -225,7 +241,11 @@ class ProfileParams(CoreModel):
     creation_policy: Annotated[
         Optional[CreationPolicy],
         Field(
-            description=
+            description=(
+                "The policy for using instances from fleets:"
+                f" {list_enum_values_for_annotation(CreationPolicy)}."
+                f" Defaults to `{CreationPolicy.REUSE_OR_CREATE.value}`"
+            )
         ),
     ] = None
     idle_duration: Annotated[
@@ -241,6 +261,26 @@ class ProfileParams(CoreModel):
         Optional[UtilizationPolicy],
         Field(description="Run termination policy based on utilization"),
     ] = None
+    startup_order: Annotated[
+        Optional[StartupOrder],
+        Field(
+            description=(
+                f"The order in which master and workers jobs are started:"
+                f" {list_enum_values_for_annotation(StartupOrder)}."
+                f" Defaults to `{StartupOrder.ANY.value}`"
+            )
+        ),
+    ] = None
+    stop_criteria: Annotated[
+        Optional[StopCriteria],
+        Field(
+            description=(
+                "The criteria determining when a multi-node run should be considered finished:"
+                f" {list_enum_values_for_annotation(StopCriteria)}."
+                f" Defaults to `{StopCriteria.ALL_DONE.value}`"
+            )
+        ),
+    ] = None
     fleets: Annotated[
         Optional[list[str]], Field(description="The fleets considered for reuse")
     ] = None
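Both new enums subclass `str`, so profile values round-trip directly between configuration strings and enum members, which is what lets pydantic accept the raw `master-first` style values. A small sketch (enum bodies copied from the hunk above):

```python
from enum import Enum


class StartupOrder(str, Enum):
    ANY = "any"
    MASTER_FIRST = "master-first"
    WORKERS_FIRST = "workers-first"


class StopCriteria(str, Enum):
    ALL_DONE = "all-done"
    MASTER_DONE = "master-done"


# str-based enums let pydantic accept the raw YAML/JSON strings:
assert StartupOrder("master-first") is StartupOrder.MASTER_FIRST
assert StopCriteria("master-done").value == "master-done"
```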
dstack/_internal/core/models/repos/local.py
CHANGED
@@ -84,9 +84,9 @@ class LocalRepo(Repo):
                 .add_custom_ignore_filename(".dstackignore")
                 .build()
             ):
-
-                if
-                    t.add(path, recursive=False)
+                entry_path_within_repo = entry.path().relative_to(repo_path)
+                if entry_path_within_repo != Path("."):
+                    t.add(entry.path(), arcname=entry_path_within_repo, recursive=False)
             logger.debug("Code file size: %s", sizeof_fmt(fp.tell()))
         return get_sha256(fp)
 
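The fix above changes how entries land in the code archive: each path is now stored under its location relative to the repo root via `arcname`, and the root entry (`.`) is skipped. A minimal standalone sketch of the same `tarfile` pattern (the `rglob` walk here is just a stand-in for dstack's ignore-aware iterator):

```python
import tarfile
from pathlib import Path


def archive_repo(repo_path: Path, out_path: Path) -> None:
    with tarfile.open(out_path, "w") as t:
        for path in sorted(repo_path.rglob("*")):
            path_within_repo = path.relative_to(repo_path)
            # Guard mirrors the diff: skip the repo root itself
            # (rglob never yields it, but an ignore-aware walker can).
            if path_within_repo != Path("."):
                # arcname strips the absolute prefix; recursive=False adds
                # directories as bare entries, since the walk visits children.
                t.add(path, arcname=path_within_repo, recursive=False)
```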
dstack/_internal/core/models/runs.py
CHANGED
@@ -148,9 +148,6 @@ class JobTerminationReason(str, Enum):
         }
         return mapping[self]
 
-    def pretty_repr(self) -> str:
-        return " ".join(self.value.split("_")).capitalize()
-
 
 class Requirements(CoreModel):
     # TODO: Make requirements' fields required
@@ -289,6 +286,9 @@ class JobSubmission(CoreModel):
     exit_status: Optional[int]
     job_provisioning_data: Optional[JobProvisioningData]
     job_runtime_data: Optional[JobRuntimeData]
+    # TODO: make status_message and error a computed field after migrating to pydanticV2
+    status_message: Optional[str]
+    error: Optional[str] = None
 
     @property
     def age(self) -> timedelta:
@@ -301,6 +301,71 @@ class JobSubmission(CoreModel):
         end_time = self.finished_at
         return end_time - self.submitted_at
 
+    @root_validator
+    def _status_message(cls, values) -> Dict:
+        try:
+            status = values["status"]
+            termination_reason = values["termination_reason"]
+            exit_code = values["exit_status"]
+        except KeyError:
+            return values
+        values["status_message"] = JobSubmission._get_status_message(
+            status=status,
+            termination_reason=termination_reason,
+            exit_status=exit_code,
+        )
+        return values
+
+    @staticmethod
+    def _get_status_message(
+        status: JobStatus,
+        termination_reason: Optional[JobTerminationReason],
+        exit_status: Optional[int],
+    ) -> str:
+        if status == JobStatus.DONE:
+            return "exited (0)"
+        elif status == JobStatus.FAILED:
+            if termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
+                return f"exited ({exit_status})"
+            elif termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
+                return "no offers"
+            elif termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
+                return "interrupted"
+            else:
+                return "error"
+        elif status == JobStatus.TERMINATED:
+            if termination_reason == JobTerminationReason.TERMINATED_BY_USER:
+                return "stopped"
+            elif termination_reason == JobTerminationReason.ABORTED_BY_USER:
+                return "aborted"
+        return status.value
+
+    @root_validator
+    def _error(cls, values) -> Dict:
+        try:
+            termination_reason = values["termination_reason"]
+        except KeyError:
+            return values
+        values["error"] = JobSubmission._get_error(termination_reason=termination_reason)
+        return values
+
+    @staticmethod
+    def _get_error(termination_reason: Optional[JobTerminationReason]) -> Optional[str]:
+        error_mapping = {
+            JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
+            JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
+            JobTerminationReason.VOLUME_ERROR: "waiting runner limit exceeded",
+            JobTerminationReason.GATEWAY_ERROR: "gateway error",
+            JobTerminationReason.SCALED_DOWN: "scaled down",
+            JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
+            JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
+            JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed",
+            JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error",
+            JobTerminationReason.EXECUTOR_ERROR: "executor error",
+            JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded",
+        }
+        return error_mapping.get(termination_reason)
+
 
 class Job(CoreModel):
     job_spec: JobSpec
@@ -431,6 +496,7 @@ class Run(CoreModel):
     submitted_at: datetime
     last_processed_at: datetime
     status: RunStatus
+    status_message: Optional[str] = None
     termination_reason: Optional[RunTerminationReason]
     run_spec: RunSpec
     jobs: List[Job]
@@ -445,15 +511,60 @@
     def _error(cls, values) -> Dict:
         try:
             termination_reason = values["termination_reason"]
-            jobs = values["jobs"]
         except KeyError:
             return values
-        values["error"] =
-
-
+        values["error"] = Run._get_error(termination_reason=termination_reason)
+        return values
+
+    @staticmethod
+    def _get_error(termination_reason: Optional[RunTerminationReason]) -> Optional[str]:
+        if termination_reason == RunTerminationReason.RETRY_LIMIT_EXCEEDED:
+            return "retry limit exceeded"
+        elif termination_reason == RunTerminationReason.SERVER_ERROR:
+            return "server error"
+        else:
+            return None
+
+    @root_validator
+    def _status_message(cls, values) -> Dict:
+        try:
+            status = values["status"]
+            jobs: List[Job] = values["jobs"]
+            retry_on_events = (
+                jobs[0].job_spec.retry.on_events if jobs and jobs[0].job_spec.retry else []
+            )
+            termination_reason = Run.get_last_termination_reason(jobs[0]) if jobs else None
+        except KeyError:
+            return values
+        values["status_message"] = Run._get_status_message(
+            status=status,
+            retry_on_events=retry_on_events,
+            termination_reason=termination_reason,
         )
         return values
 
+    @staticmethod
+    def get_last_termination_reason(job: "Job") -> Optional[JobTerminationReason]:
+        for submission in reversed(job.job_submissions):
+            if submission.termination_reason is not None:
+                return submission.termination_reason
+        return None
+
+    @staticmethod
+    def _get_status_message(
+        status: RunStatus,
+        retry_on_events: List[RetryEvent],
+        termination_reason: Optional[JobTerminationReason],
+    ) -> str:
+        # Currently, `retrying` is shown only for `no-capacity` events
+        if (
+            status in [RunStatus.SUBMITTED, RunStatus.PENDING]
+            and termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
+            and RetryEvent.NO_CAPACITY in retry_on_events
+        ):
+            return "retrying"
+        return status.value
+
 
 class JobPlan(CoreModel):
     job_spec: JobSpec
@@ -502,40 +613,3 @@ def get_policy_map(spot_policy: Optional[SpotPolicy], default: SpotPolicy) -> Op
         SpotPolicy.ONDEMAND: False,
     }
     return policy_map[spot_policy]
-
-
-def _get_run_error(
-    run_termination_reason: Optional[RunTerminationReason],
-    run_jobs: List[Job],
-) -> str:
-    if run_termination_reason is None:
-        return ""
-    if len(run_jobs) > 1:
-        return run_termination_reason.name
-    run_job_termination_reason, exit_status = _get_run_job_termination_reason_and_exit_status(
-        run_jobs
-    )
-    # For failed runs, also show termination reason to provide more context.
-    # For other run statuses, the job termination reason will duplicate run status.
-    if run_job_termination_reason is not None and run_termination_reason in [
-        RunTerminationReason.JOB_FAILED,
-        RunTerminationReason.SERVER_ERROR,
-        RunTerminationReason.RETRY_LIMIT_EXCEEDED,
-    ]:
-        if exit_status:
-            return (
-                f"{run_termination_reason.name}\n({run_job_termination_reason.name} {exit_status})"
-            )
-        return f"{run_termination_reason.name}\n({run_job_termination_reason.name})"
-    return run_termination_reason.name
-
-
-def _get_run_job_termination_reason_and_exit_status(
-    run_jobs: List[Job],
-) -> tuple[Optional[JobTerminationReason], Optional[int]]:
-    for job in run_jobs:
-        if len(job.job_submissions) > 0:
-            job_submission = job.job_submissions[-1]
-            if job_submission.termination_reason is not None:
-                return job_submission.termination_reason, job_submission.exit_status
-    return None, None
dstack/_internal/server/app.py
CHANGED
@@ -128,7 +128,7 @@ async def lifespan(app: FastAPI):
         yes=UPDATE_DEFAULT_PROJECT,
         no=DO_NOT_UPDATE_DEFAULT_PROJECT,
     )
-    if settings.
+    if settings.SERVER_S3_BUCKET is not None or settings.SERVER_GCS_BUCKET is not None:
         init_default_storage()
     scheduler = start_background_tasks()
     dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
dstack/_internal/server/background/tasks/process_running_jobs.py
CHANGED
@@ -18,6 +18,7 @@ from dstack._internal.core.models.instances import (
     SSHConnectionParams,
 )
 from dstack._internal.core.models.metrics import Metric
+from dstack._internal.core.models.profiles import StartupOrder
 from dstack._internal.core.models.repos import RemoteRepoCreds
 from dstack._internal.core.models.runs import (
     ClusterInfo,
@@ -184,18 +185,10 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
     if job_provisioning_data.hostname is None:
         await _wait_for_instance_provisioning_data(job_model=job_model)
     else:
-
-
-
-
-                other_job.job_spec.replica_num == job.job_spec.replica_num
-                and other_job.job_submissions[-1].status == JobStatus.PROVISIONING
-                and other_job.job_submissions[-1].job_provisioning_data is not None
-                and other_job.job_submissions[-1].job_provisioning_data.hostname is None
-            ):
-                job_model.last_processed_at = common_utils.get_current_datetime()
-                await session.commit()
-                return
+        if _should_wait_for_other_nodes(run, job, job_model):
+            job_model.last_processed_at = common_utils.get_current_datetime()
+            await session.commit()
+            return
 
     # fails are acceptable until timeout is exceeded
     if job_provisioning_data.dockerized:
@@ -406,6 +399,48 @@ async def _wait_for_instance_provisioning_data(job_model: JobModel):
     job_model.job_provisioning_data = job_model.instance.job_provisioning_data
 
 
+def _should_wait_for_other_nodes(run: Run, job: Job, job_model: JobModel) -> bool:
+    for other_job in run.jobs:
+        if (
+            other_job.job_spec.replica_num == job.job_spec.replica_num
+            and other_job.job_submissions[-1].status == JobStatus.PROVISIONING
+            and other_job.job_submissions[-1].job_provisioning_data is not None
+            and other_job.job_submissions[-1].job_provisioning_data.hostname is None
+        ):
+            logger.debug(
+                "%s: waiting for other job to have IP assigned",
+                fmt(job_model),
+            )
+            return True
+    master_job = find_job(run.jobs, job.job_spec.replica_num, 0)
+    if (
+        job.job_spec.job_num != 0
+        and run.run_spec.merged_profile.startup_order == StartupOrder.MASTER_FIRST
+        and master_job.job_submissions[-1].status != JobStatus.RUNNING
+    ):
+        logger.debug(
+            "%s: waiting for master job to become running",
+            fmt(job_model),
+        )
+        return True
+    if (
+        job.job_spec.job_num == 0
+        and run.run_spec.merged_profile.startup_order == StartupOrder.WORKERS_FIRST
+    ):
+        for other_job in run.jobs:
+            if (
+                other_job.job_spec.replica_num == job.job_spec.replica_num
+                and other_job.job_spec.job_num != job.job_spec.job_num
+                and other_job.job_submissions[-1].status != JobStatus.RUNNING
+            ):
+                logger.debug(
+                    "%s: waiting for worker job to become running",
+                    fmt(job_model),
+                )
+                return True
+    return False
+
+
 @runner_ssh_tunnel(ports=[DSTACK_SHIM_HTTP_PORT], retries=1)
 def _process_provisioning_with_shim(
     ports: Dict[int, int],
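`_should_wait_for_other_nodes` implements the new `startup_order` gating: any node waits while a peer in its replica still lacks an IP; with `master-first`, workers additionally hold until the master job is running; with `workers-first`, the master holds until every worker is running. A toy simulation of the master-first rule, with plain dataclasses standing in for dstack's job models:

```python
from dataclasses import dataclass


@dataclass
class FakeJob:
    job_num: int
    status: str  # latest submission status, e.g. "provisioning" or "running"


def should_wait_master_first(job: FakeJob, jobs: list[FakeJob]) -> bool:
    # Simplified from _should_wait_for_other_nodes: a worker (job_num != 0)
    # waits until the master (job_num == 0) reports "running".
    master = next(j for j in jobs if j.job_num == 0)
    return job.job_num != 0 and master.status != "running"


jobs = [FakeJob(0, "provisioning"), FakeJob(1, "provisioning")]
assert should_wait_master_first(jobs[1], jobs)      # worker waits
jobs[0].status = "running"
assert not should_wait_master_first(jobs[1], jobs)  # master up, worker proceeds
```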
dstack/_internal/server/background/tasks/process_runs.py
CHANGED
@@ -10,7 +10,7 @@ from sqlalchemy.orm import joinedload, selectinload
 import dstack._internal.server.services.gateways as gateways
 import dstack._internal.server.services.services.autoscalers as autoscalers
 from dstack._internal.core.errors import ServerError
-from dstack._internal.core.models.profiles import RetryEvent
+from dstack._internal.core.models.profiles import RetryEvent, StopCriteria
 from dstack._internal.core.models.runs import (
     Job,
     JobStatus,
@@ -313,6 +313,10 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
             termination_reason = RunTerminationReason.RETRY_LIMIT_EXCEEDED
         else:
             raise ValueError(f"Unexpected termination reason {run_termination_reasons}")
+    elif _should_stop_on_master_done(run):
+        new_status = RunStatus.TERMINATING
+        # ALL_JOBS_DONE is used for all DONE reasons including master-done
+        termination_reason = RunTerminationReason.ALL_JOBS_DONE
     elif RunStatus.RUNNING in run_statuses:
         new_status = RunStatus.RUNNING
     elif RunStatus.PROVISIONING in run_statuses:
@@ -434,3 +438,12 @@ def _can_retry_single_job(run_spec: RunSpec) -> bool:
     # We could make partial retry in some multi-node cases.
     # E.g. restarting a worker node, independent jobs.
     return False
+
+
+def _should_stop_on_master_done(run: Run) -> bool:
+    if run.run_spec.merged_profile.stop_criteria != StopCriteria.MASTER_DONE:
+        return False
+    for job in run.jobs:
+        if job.job_spec.job_num == 0 and job.job_submissions[-1].status == JobStatus.DONE:
+            return True
+    return False
dstack/_internal/server/services/runner/client.py
CHANGED
@@ -32,6 +32,7 @@ from dstack._internal.utils.common import get_or_error
 from dstack._internal.utils.logging import get_logger
 
 REQUEST_TIMEOUT = 9
+UPLOAD_CODE_REQUEST_TIMEOUT = 60
 
 logger = get_logger(__name__)
 
@@ -109,7 +110,9 @@ class RunnerClient:
         resp.raise_for_status()
 
     def upload_code(self, file: Union[BinaryIO, bytes]):
-        resp = requests.post(
+        resp = requests.post(
+            self._url("/api/upload_code"), data=file, timeout=UPLOAD_CODE_REQUEST_TIMEOUT
+        )
         resp.raise_for_status()
 
     def run_job(self):
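Code uploads stream the whole repo archive, so `upload_code` now gets a 60-second budget instead of sharing the generic 9-second `REQUEST_TIMEOUT`. If one wanted to keep failing fast on connection problems, `requests` also accepts a `(connect, read)` tuple; a hedged variant (the URL is a placeholder):

```python
import requests

# Hypothetical variant: fail fast on connect, allow a slow upload to finish.
resp = requests.post(
    "http://runner.example/api/upload_code",
    data=b"...",
    timeout=(9, 60),  # (connect timeout, read timeout) in seconds
)
resp.raise_for_status()
```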
dstack/_internal/server/services/storage/__init__.py
ADDED
@@ -0,0 +1,38 @@
+from typing import Optional
+
+from dstack._internal.server import settings
+from dstack._internal.server.services.storage.base import BaseStorage
+from dstack._internal.server.services.storage.gcs import GCS_AVAILABLE, GCSStorage
+from dstack._internal.server.services.storage.s3 import BOTO_AVAILABLE, S3Storage
+
+_default_storage = None
+
+
+def init_default_storage():
+    global _default_storage
+    if settings.SERVER_S3_BUCKET is None and settings.SERVER_GCS_BUCKET is None:
+        raise ValueError(
+            "Either settings.SERVER_S3_BUCKET or settings.SERVER_GCS_BUCKET must be set"
+        )
+    if settings.SERVER_S3_BUCKET and settings.SERVER_GCS_BUCKET:
+        raise ValueError(
+            "Only one of settings.SERVER_S3_BUCKET or settings.SERVER_GCS_BUCKET can be set"
+        )
+
+    if settings.SERVER_S3_BUCKET:
+        if not BOTO_AVAILABLE:
+            raise ValueError("AWS dependencies are not installed")
+        _default_storage = S3Storage(
+            bucket=settings.SERVER_S3_BUCKET,
+            region=settings.SERVER_S3_BUCKET_REGION,
+        )
+    elif settings.SERVER_GCS_BUCKET:
+        if not GCS_AVAILABLE:
+            raise ValueError("GCS dependencies are not installed")
+        _default_storage = GCSStorage(
+            bucket=settings.SERVER_GCS_BUCKET,
+        )
+
+
+def get_default_storage() -> Optional[BaseStorage]:
+    return _default_storage
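A usage sketch for the new package: the server calls `init_default_storage()` once at startup (see the `app.py` hunk above) and later retrieves the singleton with `get_default_storage()`, which stays `None` when no bucket is configured. The settings attributes are assigned directly here only to keep the sketch self-contained; in a real deployment they come from the server's environment:

```python
from dstack._internal.server import settings
from dstack._internal.server.services.storage import (
    get_default_storage,
    init_default_storage,
)

settings.SERVER_GCS_BUCKET = "my-dstack-bucket"  # placeholder bucket name
settings.SERVER_S3_BUCKET = None  # exactly one backend may be configured

init_default_storage()  # raises ValueError if settings or deps are off
storage = get_default_storage()
if storage is not None:
    storage.upload_code("project-id", "repo-id", "abc123", b"tar bytes")
    assert storage.get_code("project-id", "repo-id", "abc123") == b"tar bytes"
```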
dstack/_internal/server/services/storage/base.py
ADDED
@@ -0,0 +1,27 @@
+from abc import ABC, abstractmethod
+from typing import Optional
+
+
+class BaseStorage(ABC):
+    @abstractmethod
+    def upload_code(
+        self,
+        project_id: str,
+        repo_id: str,
+        code_hash: str,
+        blob: bytes,
+    ):
+        pass
+
+    @abstractmethod
+    def get_code(
+        self,
+        project_id: str,
+        repo_id: str,
+        code_hash: str,
+    ) -> Optional[bytes]:
+        pass
+
+    @staticmethod
+    def _get_code_key(project_id: str, repo_id: str, code_hash: str) -> str:
+        return f"data/projects/{project_id}/codes/{repo_id}/{code_hash}"
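`BaseStorage` fixes the contract shared by the S3 and GCS backends, including the object key scheme in `_get_code_key`. A hypothetical in-memory implementation, e.g. as a test double:

```python
from typing import Dict, Optional

from dstack._internal.server.services.storage.base import BaseStorage


class InMemoryStorage(BaseStorage):
    """Hypothetical test double keyed by the same _get_code_key scheme."""

    def __init__(self) -> None:
        self._blobs: Dict[str, bytes] = {}

    def upload_code(self, project_id: str, repo_id: str, code_hash: str, blob: bytes):
        self._blobs[self._get_code_key(project_id, repo_id, code_hash)] = blob

    def get_code(self, project_id: str, repo_id: str, code_hash: str) -> Optional[bytes]:
        return self._blobs.get(self._get_code_key(project_id, repo_id, code_hash))
```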
dstack/_internal/server/services/storage/gcs.py
ADDED
@@ -0,0 +1,44 @@
+from typing import Optional
+
+from dstack._internal.server.services.storage.base import BaseStorage
+
+GCS_AVAILABLE = True
+try:
+    from google.cloud import storage
+    from google.cloud.exceptions import NotFound
+except ImportError:
+    GCS_AVAILABLE = False
+
+
+class GCSStorage(BaseStorage):
+    def __init__(
+        self,
+        bucket: str,
+    ):
+        self._client = storage.Client()
+        self._bucket = self._client.bucket(bucket)
+
+    def upload_code(
+        self,
+        project_id: str,
+        repo_id: str,
+        code_hash: str,
+        blob: bytes,
+    ):
+        blob_name = self._get_code_key(project_id, repo_id, code_hash)
+        blob_obj = self._bucket.blob(blob_name)
+        blob_obj.upload_from_string(blob)
+
+    def get_code(
+        self,
+        project_id: str,
+        repo_id: str,
+        code_hash: str,
+    ) -> Optional[bytes]:
+        try:
+            blob_name = self._get_code_key(project_id, repo_id, code_hash)
+            blob = self._bucket.blob(blob_name)
+        except NotFound:
+            return None
+
+        return blob.download_as_bytes()
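One subtlety in `get_code` above: `bucket.blob()` only builds a local reference and performs no API call, so it never raises `NotFound`; the request happens in `download_as_bytes()`, which sits outside the `try`. A sketch of the variant that would actually return `None` for missing objects (same client objects assumed):

```python
def get_code(self, project_id: str, repo_id: str, code_hash: str) -> Optional[bytes]:
    blob_name = self._get_code_key(project_id, repo_id, code_hash)
    blob = self._bucket.blob(blob_name)  # local handle, no network round trip
    try:
        # download_as_bytes() is what raises google.cloud.exceptions.NotFound
        return blob.download_as_bytes()
    except NotFound:
        return None
```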