dstack 0.19.16__py3-none-any.whl → 0.19.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/secrets.py +92 -0
- dstack/_internal/cli/main.py +2 -0
- dstack/_internal/cli/services/completion.py +5 -0
- dstack/_internal/cli/services/configurators/fleet.py +13 -1
- dstack/_internal/cli/services/configurators/run.py +59 -17
- dstack/_internal/cli/utils/secrets.py +25 -0
- dstack/_internal/core/backends/__init__.py +10 -4
- dstack/_internal/core/backends/aws/compute.py +237 -18
- dstack/_internal/core/backends/base/compute.py +20 -2
- dstack/_internal/core/backends/cudo/compute.py +23 -9
- dstack/_internal/core/backends/gcp/compute.py +13 -7
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
- dstack/_internal/core/compatibility/fleets.py +12 -11
- dstack/_internal/core/compatibility/gateways.py +9 -8
- dstack/_internal/core/compatibility/logs.py +4 -3
- dstack/_internal/core/compatibility/runs.py +41 -17
- dstack/_internal/core/compatibility/volumes.py +9 -8
- dstack/_internal/core/errors.py +4 -0
- dstack/_internal/core/models/common.py +7 -0
- dstack/_internal/core/models/configurations.py +11 -0
- dstack/_internal/core/models/files.py +67 -0
- dstack/_internal/core/models/runs.py +14 -0
- dstack/_internal/core/models/secrets.py +9 -2
- dstack/_internal/core/services/diff.py +36 -3
- dstack/_internal/server/app.py +22 -0
- dstack/_internal/server/background/__init__.py +61 -37
- dstack/_internal/server/background/tasks/process_fleets.py +19 -3
- dstack/_internal/server/background/tasks/process_gateways.py +1 -1
- dstack/_internal/server/background/tasks/process_instances.py +13 -2
- dstack/_internal/server/background/tasks/process_placement_groups.py +4 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +123 -15
- dstack/_internal/server/background/tasks/process_runs.py +23 -7
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +36 -7
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +5 -3
- dstack/_internal/server/background/tasks/process_volumes.py +2 -2
- dstack/_internal/server/migrations/versions/5f1707c525d2_add_filearchivemodel.py +39 -0
- dstack/_internal/server/migrations/versions/644b8a114187_add_secretmodel.py +49 -0
- dstack/_internal/server/models.py +33 -0
- dstack/_internal/server/routers/files.py +67 -0
- dstack/_internal/server/routers/secrets.py +57 -15
- dstack/_internal/server/schemas/files.py +5 -0
- dstack/_internal/server/schemas/runner.py +2 -0
- dstack/_internal/server/schemas/secrets.py +7 -11
- dstack/_internal/server/services/backends/__init__.py +1 -1
- dstack/_internal/server/services/files.py +91 -0
- dstack/_internal/server/services/fleets.py +5 -4
- dstack/_internal/server/services/gateways/__init__.py +4 -2
- dstack/_internal/server/services/jobs/__init__.py +19 -8
- dstack/_internal/server/services/jobs/configurators/base.py +25 -3
- dstack/_internal/server/services/jobs/configurators/dev.py +3 -3
- dstack/_internal/server/services/locking.py +101 -12
- dstack/_internal/server/services/proxy/repo.py +3 -0
- dstack/_internal/server/services/runner/client.py +8 -0
- dstack/_internal/server/services/runs.py +76 -47
- dstack/_internal/server/services/secrets.py +204 -0
- dstack/_internal/server/services/storage/base.py +21 -0
- dstack/_internal/server/services/storage/gcs.py +28 -6
- dstack/_internal/server/services/storage/s3.py +27 -9
- dstack/_internal/server/services/volumes.py +2 -2
- dstack/_internal/server/settings.py +19 -5
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-a4eafa74304e587d037c.js → main-d1ac2e8c38ed5f08a114.js} +86 -34
- dstack/_internal/server/statics/{main-a4eafa74304e587d037c.js.map → main-d1ac2e8c38ed5f08a114.js.map} +1 -1
- dstack/_internal/server/statics/{main-f53d6d0d42f8d61df1de.css → main-d58fc0460cb0eae7cb5c.css} +1 -1
- dstack/_internal/server/statics/static/media/google.b194b06fafd0a52aeb566922160ea514.svg +1 -0
- dstack/_internal/server/testing/common.py +50 -8
- dstack/_internal/settings.py +4 -0
- dstack/_internal/utils/files.py +69 -0
- dstack/_internal/utils/nested_list.py +47 -0
- dstack/_internal/utils/path.py +12 -4
- dstack/api/_public/runs.py +67 -7
- dstack/api/server/__init__.py +6 -0
- dstack/api/server/_files.py +18 -0
- dstack/api/server/_secrets.py +15 -15
- dstack/version.py +1 -1
- {dstack-0.19.16.dist-info → dstack-0.19.18.dist-info}/METADATA +13 -13
- {dstack-0.19.16.dist-info → dstack-0.19.18.dist-info}/RECORD +80 -67
- {dstack-0.19.16.dist-info → dstack-0.19.18.dist-info}/WHEEL +0 -0
- {dstack-0.19.16.dist-info → dstack-0.19.18.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.16.dist-info → dstack-0.19.18.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/compatibility/logs.py
CHANGED

@@ -1,15 +1,16 @@
-from typing import Dict, Optional
+from typing import Optional
 
+from dstack._internal.core.models.common import IncludeExcludeDictType
 from dstack._internal.server.schemas.logs import PollLogsRequest
 
 
-def get_poll_logs_excludes(request: PollLogsRequest) -> Optional[Dict]:
+def get_poll_logs_excludes(request: PollLogsRequest) -> Optional[IncludeExcludeDictType]:
     """
     Returns exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    excludes = {}
+    excludes: IncludeExcludeDictType = {}
     if request.next_token is None:
         excludes["next_token"] = True
     return excludes if excludes else None
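The exclude-mapping pattern above is easiest to see end to end. Below is a minimal, self-contained sketch of how an unset new field is kept out of the serialized request; the `PollLogsRequest` here is a simplified stand-in for the real schema, not dstack code:

```python
from typing import Optional

from pydantic import BaseModel


class PollLogsRequest(BaseModel):  # simplified stand-in for the real schema
    run_name: str
    next_token: Optional[str] = None  # new field unknown to older servers


def get_poll_logs_excludes(request: PollLogsRequest) -> Optional[dict]:
    excludes = {}
    if request.next_token is None:
        excludes["next_token"] = True
    return excludes or None


req = PollLogsRequest(run_name="my-run")
# The unset field is dropped entirely, so an older server never sees it.
print(req.dict(exclude=get_poll_logs_excludes(req)))  # {'run_name': 'my-run'}
```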
dstack/_internal/core/compatibility/runs.py
CHANGED

@@ -1,31 +1,35 @@
-from typing import Dict, Optional, Set
+from typing import Optional
 
+from dstack._internal.core.models.common import IncludeExcludeDictType, IncludeExcludeSetType
 from dstack._internal.core.models.configurations import ServiceConfiguration
-from dstack._internal.core.models.runs import ApplyRunPlanInput, JobSubmission, RunSpec
+from dstack._internal.core.models.runs import ApplyRunPlanInput, JobSpec, JobSubmission, RunSpec
 from dstack._internal.server.schemas.runs import GetRunPlanRequest
 
 
-def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
+def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeDictType]:
     """
     Returns `plan` exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    apply_plan_excludes = {}
+    apply_plan_excludes: IncludeExcludeDictType = {}
     run_spec_excludes = get_run_spec_excludes(plan.run_spec)
     if run_spec_excludes is not None:
         apply_plan_excludes["run_spec"] = run_spec_excludes
     current_resource = plan.current_resource
     if current_resource is not None:
-        current_resource_excludes = {}
+        current_resource_excludes: IncludeExcludeDictType = {}
         current_resource_excludes["status_message"] = True
         if current_resource.deployment_num == 0:
             current_resource_excludes["deployment_num"] = True
         apply_plan_excludes["current_resource"] = current_resource_excludes
         current_resource_excludes["run_spec"] = get_run_spec_excludes(current_resource.run_spec)
-        job_submissions_excludes = {}
+        job_submissions_excludes: IncludeExcludeDictType = {}
         current_resource_excludes["jobs"] = {
-            "__all__": {"job_submissions": {"__all__": job_submissions_excludes}}
+            "__all__": {
+                "job_spec": get_job_spec_excludes([job.job_spec for job in current_resource.jobs]),
+                "job_submissions": {"__all__": job_submissions_excludes},
+            }
         }
         job_submissions = [js for j in current_resource.jobs for js in j.job_submissions]
         if all(map(_should_exclude_job_submission_jpd_cpu_arch, job_submissions)):
@@ -42,7 +46,7 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
             job_submissions_excludes["deployment_num"] = True
         latest_job_submission = current_resource.latest_job_submission
         if latest_job_submission is not None:
-            latest_job_submission_excludes = {}
+            latest_job_submission_excludes: IncludeExcludeDictType = {}
            current_resource_excludes["latest_job_submission"] = latest_job_submission_excludes
             if _should_exclude_job_submission_jpd_cpu_arch(latest_job_submission):
                 latest_job_submission_excludes["job_provisioning_data"] = {
@@ -59,12 +63,12 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
     return {"plan": apply_plan_excludes}
 
 
-def get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[Dict]:
+def get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[IncludeExcludeDictType]:
     """
     Excludes new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    get_plan_excludes = {}
+    get_plan_excludes: IncludeExcludeDictType = {}
     run_spec_excludes = get_run_spec_excludes(request.run_spec)
     if run_spec_excludes is not None:
         get_plan_excludes["run_spec"] = run_spec_excludes
@@ -73,15 +77,15 @@ def get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[Dict]:
     return get_plan_excludes
 
 
-def get_run_spec_excludes(run_spec: RunSpec) -> Optional[Dict]:
+def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
     """
     Returns `run_spec` exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    spec_excludes: Dict = {}
-    configuration_excludes: Dict = {}
-    profile_excludes: Set = set()
+    spec_excludes: IncludeExcludeDictType = {}
+    configuration_excludes: IncludeExcludeDictType = {}
+    profile_excludes: IncludeExcludeSetType = set()
     configuration = run_spec.configuration
     profile = run_spec.profile
 
@@ -109,14 +113,34 @@ def get_run_spec_excludes(run_spec: RunSpec) -> Optional[Dict]:
         configuration_excludes["stop_criteria"] = True
     if profile is not None and profile.stop_criteria is None:
         profile_excludes.add("stop_criteria")
+    if not configuration.files:
+        configuration_excludes["files"] = True
+    if not run_spec.file_archives:
+        spec_excludes["file_archives"] = True
 
     if configuration_excludes:
         spec_excludes["configuration"] = configuration_excludes
     if profile_excludes:
         spec_excludes["profile"] = profile_excludes
-    if spec_excludes:
-        return spec_excludes
-    return None
+    return spec_excludes
+
+
+def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType:
+    """
+    Returns `job_spec` exclude mapping to exclude certain fields from the request.
+    Use this method to exclude new fields when they are not set to keep
+    clients backward-compatibility with older servers.
+    """
+    spec_excludes: IncludeExcludeDictType = {}
+
+    if all(s.repo_code_hash is None for s in job_specs):
+        spec_excludes["repo_code_hash"] = True
+    if all(s.repo_data is None for s in job_specs):
+        spec_excludes["repo_data"] = True
+    if all(not s.file_archives for s in job_specs):
+        spec_excludes["file_archives"] = True
+
+    return spec_excludes
 
 
 def _should_exclude_job_submission_jpd_cpu_arch(job_submission: JobSubmission) -> bool:
dstack/_internal/core/compatibility/volumes.py
CHANGED

@@ -1,32 +1,33 @@
-from typing import Dict
-
+from dstack._internal.core.models.common import IncludeExcludeDictType
 from dstack._internal.core.models.volumes import VolumeConfiguration, VolumeSpec
 
 
-def get_volume_spec_excludes(volume_spec: VolumeSpec) -> Dict:
+def get_volume_spec_excludes(volume_spec: VolumeSpec) -> IncludeExcludeDictType:
     """
     Returns `volume_spec` exclude mapping to exclude certain fields from the request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    spec_excludes = {}
+    spec_excludes: IncludeExcludeDictType = {}
     spec_excludes["configuration"] = _get_volume_configuration_excludes(volume_spec.configuration)
     return spec_excludes
 
 
-def get_create_volume_excludes(configuration: VolumeConfiguration) -> Dict:
+def get_create_volume_excludes(configuration: VolumeConfiguration) -> IncludeExcludeDictType:
     """
     Returns an exclude mapping to exclude certain fields from the create volume request.
     Use this method to exclude new fields when they are not set to keep
     clients backward-compatibility with older servers.
     """
-    create_volume_excludes = {}
+    create_volume_excludes: IncludeExcludeDictType = {}
     create_volume_excludes["configuration"] = _get_volume_configuration_excludes(configuration)
     return create_volume_excludes
 
 
-def _get_volume_configuration_excludes(configuration: VolumeConfiguration) -> Dict:
-    configuration_excludes = {}
+def _get_volume_configuration_excludes(
+    configuration: VolumeConfiguration,
+) -> IncludeExcludeDictType:
+    configuration_excludes: IncludeExcludeDictType = {}
     if configuration.tags is None:
         configuration_excludes["tags"] = True
     return configuration_excludes
dstack/_internal/core/errors.py
CHANGED
@@ -110,6 +110,10 @@ class PlacementGroupInUseError(ComputeError):
     pass
 
 
+class PlacementGroupNotSupportedError(ComputeError):
+    pass
+
+
 class NotYetTerminated(ComputeError):
     """
     Used by Compute.terminate_instance to signal that instance termination is not complete
dstack/_internal/core/models/common.py
CHANGED

@@ -6,6 +6,13 @@ from pydantic import Field
 from pydantic_duality import DualBaseModel
 from typing_extensions import Annotated
 
+IncludeExcludeFieldType = Union[int, str]
+IncludeExcludeSetType = set[IncludeExcludeFieldType]
+IncludeExcludeDictType = dict[
+    IncludeExcludeFieldType, Union[bool, IncludeExcludeSetType, "IncludeExcludeDictType"]
+]
+IncludeExcludeType = Union[IncludeExcludeSetType, IncludeExcludeDictType]
+
 
 # DualBaseModel creates two classes for the model:
 # one with extra = "forbid" (CoreModel/CoreModel.__request__),
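These aliases describe the shapes Pydantic v1 accepts for `dict(include=...)`/`dict(exclude=...)`: a set of field names, or a nested dict whose values are `True`, sets, or further dicts, with `"__all__"` fanning out over list items. A sketch with a hypothetical model (not dstack code):

```python
from typing import List

from pydantic import BaseModel


class Job(BaseModel):
    name: str = "train"
    spec: dict = {}


class Run(BaseModel):
    jobs: List[Job] = [Job()]
    token: str = "secret"


# The same nested shape as IncludeExcludeDictType: bools, sets, and dicts,
# with "__all__" applying the exclusion to every list element.
exclude = {"token": True, "jobs": {"__all__": {"spec"}}}
print(Run().dict(exclude=exclude))  # {'jobs': [{'name': 'train'}]}
```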
dstack/_internal/core/models/configurations.py
CHANGED

@@ -10,6 +10,7 @@ from typing_extensions import Annotated, Literal
 from dstack._internal.core.errors import ConfigurationError
 from dstack._internal.core.models.common import CoreModel, Duration, RegistryAuth
 from dstack._internal.core.models.envs import Env
+from dstack._internal.core.models.files import FilePathMapping
 from dstack._internal.core.models.fleets import FleetConfiguration
 from dstack._internal.core.models.gateways import GatewayConfiguration
 from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
@@ -252,6 +253,10 @@ class BaseRunConfiguration(CoreModel):
             description="Use Docker inside the container. Mutually exclusive with `image`, `python`, and `nvcc`. Overrides `privileged`"
         ),
     ] = None
+    files: Annotated[
+        list[Union[FilePathMapping, str]],
+        Field(description="The local to container file path mappings"),
+    ] = []
     # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
     setup: CommandsList = []
 
@@ -285,6 +290,12 @@ class BaseRunConfiguration(CoreModel):
             return parse_mount_point(v)
         return v
 
+    @validator("files", each_item=True)
+    def convert_files(cls, v) -> FilePathMapping:
+        if isinstance(v, str):
+            return FilePathMapping.parse(v)
+        return v
+
     @validator("user")
     def validate_user(cls, v) -> Optional[str]:
         if v is None:
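The `convert_files` validator means `files` entries may be plain `local:container` strings or explicit mappings; both normalize to `FilePathMapping`. A stand-in sketch of that normalization (assuming dstack is installed; `normalize` is illustrative, not a dstack helper):

```python
from typing import List, Union

from dstack._internal.core.models.files import FilePathMapping


def normalize(entries: List[Union[str, FilePathMapping]]) -> List[FilePathMapping]:
    # Mirrors what the `files` validator does for each item
    return [FilePathMapping.parse(e) if isinstance(e, str) else e for e in entries]


mixed = ["~/.cache/data:data", FilePathMapping(local_path=".env", path=".env")]
for m in normalize(mixed):
    print(m.local_path, "->", m.path)
```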
dstack/_internal/core/models/files.py
ADDED

@@ -0,0 +1,67 @@
+import pathlib
+import string
+from uuid import UUID
+
+from pydantic import Field, validator
+from typing_extensions import Annotated, Self
+
+from dstack._internal.core.models.common import CoreModel
+
+
+class FileArchive(CoreModel):
+    id: UUID
+    hash: str
+
+
+class FilePathMapping(CoreModel):
+    local_path: Annotated[
+        str,
+        Field(
+            description=(
+                "The path on the user's machine. Relative paths are resolved relative to"
+                " the parent directory of the the configuration file"
+            )
+        ),
+    ]
+    path: Annotated[
+        str,
+        Field(
+            description=(
+                "The path in the container. Relative paths are resolved relative to"
+                " the repo directory (`/workflow`)"
+            )
+        ),
+    ]
+
+    @classmethod
+    def parse(cls, v: str) -> Self:
+        local_path: str
+        path: str
+        parts = v.split(":")
+        # A special case for Windows paths, e.g., `C:\path\to`, 'c:/path/to'
+        if (
+            len(parts) > 1
+            and len(parts[0]) == 1
+            and parts[0] in string.ascii_letters
+            and parts[1][:1] in ["\\", "/"]
+        ):
+            parts = [f"{parts[0]}:{parts[1]}", *parts[2:]]
+        if len(parts) == 1:
+            local_path = path = parts[0]
+        elif len(parts) == 2:
+            local_path, path = parts
+        else:
+            raise ValueError(f"invalid file path mapping: {v}")
+        return cls(local_path=local_path, path=path)
+
+    @validator("path")
+    def validate_path(cls, v) -> str:
+        # True for `C:/.*`, False otherwise, including `/abs/unix/path`, `rel\windows\path`, etc.
+        if pathlib.PureWindowsPath(v).is_absolute():
+            raise ValueError(f"path must be a Unix file path: {v}")
+        return v
+
+
+class FileArchiveMapping(CoreModel):
+    id: Annotated[UUID, Field(description="The File archive ID")]
+    path: Annotated[str, Field(description="The path in the container")]
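Following the `parse` logic above, the mapping syntax behaves as follows (a sketch assuming dstack is installed; `print` shows the model repr):

```python
from dstack._internal.core.models.files import FilePathMapping

# One part: the same path is used locally and in the container
print(FilePathMapping.parse("./data"))
# Two parts: local path, then container path
print(FilePathMapping.parse("./local.txt:remote.txt"))
# Windows drive letters are not treated as mapping separators
print(FilePathMapping.parse(r"C:\data:dataset"))  # local_path='C:\\data', path='dataset'
```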
dstack/_internal/core/models/runs.py
CHANGED

@@ -12,6 +12,7 @@ from dstack._internal.core.models.configurations import (
     AnyRunConfiguration,
     RunConfiguration,
 )
+from dstack._internal.core.models.files import FileArchiveMapping
 from dstack._internal.core.models.instances import (
     InstanceOfferWithAvailability,
     InstanceType,
@@ -217,6 +218,15 @@ class JobSpec(CoreModel):
     volumes: Optional[List[MountPoint]] = None
     ssh_key: Optional[JobSSHKey] = None
     working_dir: Optional[str]
+    # `repo_data` is optional for client compatibility with pre-0.19.17 servers and for compatibility
+    # with jobs submitted before 0.19.17. All new jobs are expected to have non-None `repo_data`.
+    # For --no-repo runs, `repo_data` is `VirtualRunRepoData()`.
+    repo_data: Annotated[Optional[AnyRunRepoData], Field(discriminator="repo_type")] = None
+    # `repo_code_hash` can be None because it is not used for the repo or because the job was
+    # submitted before 0.19.17. See `_get_repo_code_hash` on how to get the correct `repo_code_hash`
+    # TODO: drop this comment when supporting jobs submitted before 0.19.17 is no longer relevant.
+    repo_code_hash: Optional[str] = None
+    file_archives: list[FileArchiveMapping] = []
 
 
 class JobProvisioningData(CoreModel):
@@ -413,6 +423,10 @@ class RunSpec(CoreModel):
         Optional[str],
         Field(description="The hash of the repo diff. Can be omitted if there is no repo diff."),
     ] = None
+    file_archives: Annotated[
+        list[FileArchiveMapping],
+        Field(description="The list of file archive ID to container path mappings"),
+    ] = []
     working_dir: Annotated[
         Optional[str],
         Field(
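A `FileArchiveMapping` pairs an uploaded archive's ID with the container path it is delivered to; a minimal construction sketch (assuming dstack is installed):

```python
from uuid import uuid4

from dstack._internal.core.models.files import FileArchiveMapping

mapping = FileArchiveMapping(id=uuid4(), path="data/input")
print(mapping.dict())  # {'id': UUID('...'), 'path': 'data/input'}
```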
dstack/_internal/core/models/secrets.py
CHANGED

@@ -1,9 +1,16 @@
+from typing import Optional
+from uuid import UUID
+
 from dstack._internal.core.models.common import CoreModel
 
 
 class Secret(CoreModel):
+    id: UUID
     name: str
-    value: str
+    value: Optional[str] = None
 
     def __str__(self) -> str:
-        return f'Secret(name="{self.name}", value={"*" * len(self.value)})'
+        displayed_value = "*"
+        if self.value is not None:
+            displayed_value = "*" * len(self.value)
+        return f'Secret(name="{self.name}", value={displayed_value})'
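The reworked `__str__` keeps masking the value while tolerating `value=None` (the value may now be omitted, e.g. when it is not returned by the server); a quick sketch (assuming dstack is installed):

```python
from uuid import uuid4

from dstack._internal.core.models.secrets import Secret

print(str(Secret(id=uuid4(), name="hf_token", value="hunter2")))  # Secret(name="hf_token", value=*******)
print(str(Secret(id=uuid4(), name="hf_token")))                   # Secret(name="hf_token", value=*)
```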
dstack/_internal/core/services/diff.py
CHANGED

@@ -1,14 +1,47 @@
-from typing import Any, Dict
+from typing import Any, Optional, TypedDict
 
 from pydantic import BaseModel
 
+from dstack._internal.core.models.common import IncludeExcludeType
+
+
+class ModelFieldDiff(TypedDict):
+    old: Any
+    new: Any
+
+
+ModelDiff = dict[str, ModelFieldDiff]
+
 
 # TODO: calculate nested diffs
-def diff_models(old: BaseModel, new: BaseModel) -> Dict[str, Any]:
+def diff_models(
+    old: BaseModel, new: BaseModel, ignore: Optional[IncludeExcludeType] = None
+) -> ModelDiff:
+    """
+    Returns a diff of model instances fields.
+
+    NOTE: `ignore` is implemented as `BaseModel.parse_obj(BaseModel.dict(exclude=ignore))`,
+    that is, the "ignored" fields are actually not ignored but reset to the default values
+    before comparison, meaning that 1) any field in `ignore` must have a default value,
+    2) the default value must be equal to itself (e.g. `math.nan` != `math.nan`).
+
+    Args:
+        old: The "old" model instance.
+        new: The "new" model instance.
+        ignore: Optional fields to ignore.
+
+    Returns:
+        A dict of changed fields in the form of
+        `{<field_name>: {"old": old_value, "new": new_value}}`
+    """
     if type(old) is not type(new):
         raise TypeError("Both instances must be of the same Pydantic model class.")
 
-    changes = {}
+    if ignore is not None:
+        old = type(old).parse_obj(old.dict(exclude=ignore))
+        new = type(new).parse_obj(new.dict(exclude=ignore))
+
+    changes: ModelDiff = {}
     for field in old.__fields__:
         old_value = getattr(old, field)
         new_value = getattr(new, field)
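As the docstring warns, `ignore` resets fields to their defaults rather than skipping them; a sketch with a hypothetical model:

```python
from pydantic import BaseModel

from dstack._internal.core.services.diff import diff_models


class Profile(BaseModel):
    name: str
    retries: int = 0  # fields passed via `ignore` must have defaults


old = Profile(name="a", retries=1)
new = Profile(name="b", retries=2)
print(diff_models(old, new))
# {'name': {'old': 'a', 'new': 'b'}, 'retries': {'old': 1, 'new': 2}}
print(diff_models(old, new, ignore={"retries"}))
# {'name': {'old': 'a', 'new': 'b'}}
```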
dstack/_internal/server/app.py
CHANGED
@@ -2,6 +2,7 @@ import asyncio
 import importlib.resources
 import os
 import time
+from concurrent.futures import ThreadPoolExecutor
 from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import Awaitable, Callable, List
@@ -23,6 +24,7 @@ from dstack._internal.server.background import start_background_tasks
 from dstack._internal.server.db import get_db, get_session_ctx, migrate
 from dstack._internal.server.routers import (
     backends,
+    files,
     fleets,
     gateways,
     instances,
@@ -96,6 +98,8 @@ def create_app() -> FastAPI:
     @asynccontextmanager
     async def lifespan(app: FastAPI):
         configure_logging()
+        server_executor = ThreadPoolExecutor(max_workers=settings.SERVER_EXECUTOR_MAX_WORKERS)
+        asyncio.get_running_loop().set_default_executor(server_executor)
         await migrate()
         _print_dstack_logo()
         if not check_required_ssh_version():
@@ -197,6 +201,7 @@ def register_routes(app: FastAPI, ui: bool = True):
     app.include_router(service_proxy.router, prefix="/proxy/services", tags=["service-proxy"])
     app.include_router(model_proxy.router, prefix="/proxy/models", tags=["model-proxy"])
     app.include_router(prometheus.router)
+    app.include_router(files.router)
 
     @app.exception_handler(ForbiddenError)
     async def forbidden_error_handler(request: Request, exc: ForbiddenError):
@@ -240,6 +245,23 @@ def register_routes(app: FastAPI, ui: bool = True):
         )
         return response
 
+    if settings.SERVER_PROFILING_ENABLED:
+        from pyinstrument import Profiler
+
+        @app.middleware("http")
+        async def profile_request(request: Request, call_next):
+            profiling = request.query_params.get("profile", False)
+            if profiling:
+                profiler = Profiler()
+                profiler.start()
+                respone = await call_next(request)
+                profiler.stop()
+                with open("profiling_results.html", "w+") as f:
+                    f.write(profiler.output_html())
+                return respone
+            else:
+                return await call_next(request)
+
     # this middleware must be defined after the log_request middleware
     @app.middleware("http")
     async def log_http_metrics(request: Request, call_next):
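The `set_default_executor` call in `lifespan` matters because `loop.run_in_executor(None, ...)` dispatches blocking work to the loop's default executor, so replacing it caps the thread pool size. The mechanism in isolation (a sketch, not dstack code):

```python
import asyncio
from concurrent.futures import ThreadPoolExecutor


def blocking_io() -> str:
    return "done"


async def main() -> None:
    loop = asyncio.get_running_loop()
    # Same pattern as the lifespan hook above: install a bounded default pool
    loop.set_default_executor(ThreadPoolExecutor(max_workers=8))
    # Passing None selects the default executor installed above
    print(await loop.run_in_executor(None, blocking_io))


asyncio.run(main())
```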
dstack/_internal/server/background/__init__.py
CHANGED

@@ -37,15 +37,31 @@ def get_scheduler() -> AsyncIOScheduler:
 
 
 def start_background_tasks() -> AsyncIOScheduler:
+    # We try to process as many resources as possible without exhausting DB connections.
+    #
+    # Quick tasks can process multiple resources per transaction.
+    # Potentially long tasks process one resource per transaction
+    # to avoid holding locks for all the resources if one is slow to process.
+    # Still, the next batch won't be processed unless all resources are processed,
+    # so larger batches do not increase processing rate linearly.
+    #
+    # The interval, batch_size, and max_instances determine background tasks processing rates.
+    # By default, one server replica can handle:
+    #
+    # * 150 active jobs with 2 minutes processing latency
+    # * 150 active runs with 2 minutes processing latency
+    # * 150 active instances with 2 minutes processing latency
+    #
+    # These latency numbers do not account for provisioning time,
+    # so it may be slower if a backend is slow to provision.
+    #
+    # Users can set SERVER_BACKGROUND_PROCESSING_FACTOR to process more resources per replica.
+    # They also need to increase max db connections on the client side and db side.
+    #
     # In-memory locking via locksets does not guarantee
     # that the first waiting for the lock will acquire it.
     # The jitter is needed to give all tasks a chance to acquire locks.
 
-    # The batch_size and interval determine background tasks processing rates.
-    # Currently one server replica can handle:
-    # * 150 active jobs with up to 2 minutes processing latency
-    # * 150 active runs with up to 2 minutes processing latency
-    # * 150 active instances with up to 2 minutes processing latency
     _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
     _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
     if settings.ENABLE_PROMETHEUS_METRICS:
@@ -53,38 +69,6 @@ def start_background_tasks() -> AsyncIOScheduler:
             collect_prometheus_metrics, IntervalTrigger(seconds=10), max_instances=1
         )
         _scheduler.add_job(delete_prometheus_metrics, IntervalTrigger(minutes=5), max_instances=1)
-    # process_submitted_jobs and process_instances max processing rate is 75 jobs(instances) per minute.
-    _scheduler.add_job(
-        process_submitted_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_running_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_terminating_jobs,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_runs,
-        IntervalTrigger(seconds=2, jitter=1),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(
-        process_instances,
-        IntervalTrigger(seconds=4, jitter=2),
-        kwargs={"batch_size": 5},
-        max_instances=2,
-    )
-    _scheduler.add_job(process_fleets, IntervalTrigger(seconds=10, jitter=2))
     _scheduler.add_job(process_gateways_connections, IntervalTrigger(seconds=15))
     _scheduler.add_job(
         process_submitted_gateways, IntervalTrigger(seconds=10, jitter=2), max_instances=5
@@ -93,5 +77,45 @@ def start_background_tasks() -> AsyncIOScheduler:
         process_submitted_volumes, IntervalTrigger(seconds=10, jitter=2), max_instances=5
     )
     _scheduler.add_job(process_placement_groups, IntervalTrigger(seconds=30, jitter=5))
+    for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):
+        # Add multiple copies of tasks if requested.
+        # max_instances=1 for additional copies to avoid running too many tasks.
+        # Move other tasks here when they need per-replica scaling.
+        _scheduler.add_job(
+            process_submitted_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=4 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_running_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_terminating_jobs,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_runs,
+            IntervalTrigger(seconds=2, jitter=1),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_instances,
+            IntervalTrigger(seconds=4, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
+        _scheduler.add_job(
+            process_fleets,
+            IntervalTrigger(seconds=10, jitter=2),
+            kwargs={"batch_size": 5},
+            max_instances=2 if replica == 0 else 1,
+        )
     _scheduler.start()
     return _scheduler
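The quoted capacity numbers follow from the defaults; back-of-envelope arithmetic (assuming roughly one full batch completes per interval):

```python
batch_size = 5        # kwargs={"batch_size": 5}
interval_seconds = 4  # IntervalTrigger(seconds=4)

rate_per_minute = batch_size * 60 / interval_seconds  # 75.0 resources/minute
active_resources = 150
revisit_latency_minutes = active_resources / rate_per_minute  # 2.0 minutes

print(rate_per_minute, revisit_latency_minutes)
```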
dstack/_internal/server/background/tasks/process_fleets.py
CHANGED

@@ -1,9 +1,12 @@
+import asyncio
+from datetime import timedelta
+
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload
 
 from dstack._internal.core.models.fleets import FleetStatus
-from dstack._internal.server.db import get_session_ctx
+from dstack._internal.server.db import get_db, get_session_ctx
 from dstack._internal.server.models import FleetModel
 from dstack._internal.server.services.fleets import (
     is_fleet_empty,
@@ -17,8 +20,18 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
-async def process_fleets():
-    lock, lockset = get_locker().get_lockset(FleetModel.__tablename__)
+MIN_PROCESSING_INTERVAL = timedelta(seconds=30)
+
+
+async def process_fleets(batch_size: int = 1):
+    tasks = []
+    for _ in range(batch_size):
+        tasks.append(_process_next_fleet())
+    await asyncio.gather(*tasks)
+
+
+async def _process_next_fleet():
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(FleetModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(
@@ -26,6 +39,8 @@ async def process_fleets():
                 .where(
                     FleetModel.deleted == False,
                     FleetModel.id.not_in(lockset),
+                    FleetModel.last_processed_at
+                    < get_current_datetime().replace(tzinfo=None) - MIN_PROCESSING_INTERVAL,
                 )
                 .order_by(FleetModel.last_processed_at.asc())
                 .limit(1)
@@ -43,6 +58,7 @@ async def process_fleets():
 
 
 async def _process_fleet(session: AsyncSession, fleet_model: FleetModel):
+    logger.debug("Processing fleet %s", fleet_model.name)
     # Refetch to load related attributes.
     # joinedload produces LEFT OUTER JOIN that can't be used with FOR UPDATE.
     res = await session.execute(
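The new `process_fleets` fans out `batch_size` independent workers, each locking and processing one fleet in its own transaction, and waits for all of them. The pattern in isolation (a sketch, not dstack code):

```python
import asyncio


async def process_next_item(i: int) -> None:
    # Stand-in for _process_next_fleet: lock, process, and commit one item
    await asyncio.sleep(0.1)
    print(f"processed {i}")


async def process_items(batch_size: int = 5) -> None:
    await asyncio.gather(*(process_next_item(i) for i in range(batch_size)))


asyncio.run(process_items())
```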
dstack/_internal/server/background/tasks/process_gateways.py
CHANGED

@@ -28,7 +28,7 @@ async def process_gateways_connections():
 
 
 async def process_submitted_gateways():
-    lock, lockset = get_locker().get_lockset(GatewayModel.__tablename__)
+    lock, lockset = get_locker(get_db().dialect_name).get_lockset(GatewayModel.__tablename__)
     async with get_session_ctx() as session:
         async with lock:
             res = await session.execute(