dstack 0.19.9__py3-none-any.whl → 0.19.11rc1__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (37)
  1. dstack/_internal/cli/commands/config.py +1 -1
  2. dstack/_internal/cli/commands/metrics.py +25 -10
  3. dstack/_internal/cli/commands/project.py +161 -0
  4. dstack/_internal/cli/commands/ps.py +9 -2
  5. dstack/_internal/cli/main.py +2 -0
  6. dstack/_internal/core/backends/azure/compute.py +8 -3
  7. dstack/_internal/core/backends/base/compute.py +2 -1
  8. dstack/_internal/core/models/configurations.py +21 -4
  9. dstack/_internal/core/models/runs.py +2 -1
  10. dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +11 -1
  11. dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +12 -6
  12. dstack/_internal/proxy/gateway/services/stats.py +17 -3
  13. dstack/_internal/server/background/tasks/process_metrics.py +23 -21
  14. dstack/_internal/server/background/tasks/process_submitted_jobs.py +21 -12
  15. dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
  16. dstack/_internal/server/models.py +1 -0
  17. dstack/_internal/server/routers/repos.py +8 -4
  18. dstack/_internal/server/services/instances.py +6 -2
  19. dstack/_internal/server/services/jobs/configurators/base.py +18 -4
  20. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -1
  21. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +3 -1
  22. dstack/_internal/server/services/runs.py +31 -18
  23. dstack/_internal/server/settings.py +1 -0
  24. dstack/_internal/server/statics/index.html +1 -1
  25. dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js → main-b4803049eac16aea9a49.js} +4 -4
  26. dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js.map → main-b4803049eac16aea9a49.js.map} +1 -1
  27. dstack/_internal/server/testing/common.py +2 -0
  28. dstack/_internal/server/utils/routers.py +3 -6
  29. dstack/_internal/settings.py +4 -0
  30. dstack/api/_public/runs.py +6 -3
  31. dstack/api/server/_runs.py +2 -0
  32. dstack/version.py +2 -2
  33. {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/METADATA +11 -6
  34. {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/RECORD +37 -35
  35. {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/WHEEL +0 -0
  36. {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/entry_points.txt +0 -0
  37. {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/cli/commands/config.py
@@ -14,7 +14,7 @@ logger = get_logger(__name__)
 
 class ConfigCommand(BaseCommand):
     NAME = "config"
-    DESCRIPTION = "Configure CLI"
+    DESCRIPTION = "Configure CLI (deprecated; use `dstack project`)"
 
     def _register(self):
         super()._register()

dstack/_internal/cli/commands/metrics.py
@@ -39,8 +39,6 @@ class MetricsCommand(APIBaseCommand):
         run = self.api.runs.get(run_name=args.run_name)
         if run is None:
             raise CLIError(f"Run {args.run_name} not found")
-        if run.status.is_finished():
-            raise CLIError(f"Run {args.run_name} is finished")
         metrics = _get_run_jobs_metrics(api=self.api, run=run)
 
         if not args.watch:
@@ -55,8 +53,6 @@ class MetricsCommand(APIBaseCommand):
                     run = self.api.runs.get(run_name=args.run_name)
                     if run is None:
                         raise CLIError(f"Run {args.run_name} not found")
-                    if run.status.is_finished():
-                        raise CLIError(f"Run {args.run_name} is finished")
                     metrics = _get_run_jobs_metrics(api=self.api, run=run)
         except KeyboardInterrupt:
             pass
@@ -78,11 +74,12 @@ def _get_run_jobs_metrics(api: Client, run: Run) -> List[JobMetrics]:
 def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
     table = Table(box=None)
     table.add_column("NAME", style="bold", no_wrap=True)
+    table.add_column("STATUS")
     table.add_column("CPU")
     table.add_column("MEMORY")
     table.add_column("GPU")
 
-    run_row: Dict[Union[str, int], Any] = {"NAME": run.name}
+    run_row: Dict[Union[str, int], Any] = {"NAME": run.name, "STATUS": run.status.value}
     if len(run._run.jobs) != 1:
         add_row_from_dict(table, run_row)
 
@@ -101,9 +98,9 @@ def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
             cpu_usage = f"{cpu_usage:.0f}%"
         memory_usage = _get_metric_value(job_metrics, "memory_working_set_bytes")
         if memory_usage is not None:
-            memory_usage = f"{round(memory_usage / 1024 / 1024)}MB"
+            memory_usage = _format_memory(memory_usage, 2)
            if resources is not None:
-                memory_usage += f"/{resources.memory_mib}MB"
+                memory_usage += f"/{_format_memory(resources.memory_mib * 1024 * 1024, 2)}"
         gpu_metrics = ""
         gpus_detected_num = _get_metric_value(job_metrics, "gpus_detected_num")
         if gpus_detected_num is not None:
@@ -113,13 +110,16 @@ def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
                 if gpu_memory_usage is not None:
                     if i != 0:
                         gpu_metrics += "\n"
-                    gpu_metrics += f"#{i} {round(gpu_memory_usage / 1024 / 1024)}MB"
+                    gpu_metrics += f"gpu={i} mem={_format_memory(gpu_memory_usage, 2)}"
                     if resources is not None:
-                        gpu_metrics += f"/{resources.gpus[i].memory_mib}MB"
-                    gpu_metrics += f" {gpu_util_percent}% Util"
+                        gpu_metrics += (
+                            f"/{_format_memory(resources.gpus[i].memory_mib * 1024 * 1024, 2)}"
+                        )
+                    gpu_metrics += f" util={gpu_util_percent}%"
 
         job_row: Dict[Union[str, int], Any] = {
             "NAME": f" replica={job.job_spec.replica_num} job={job.job_spec.job_num}",
+            "STATUS": job.job_submissions[-1].status.value,
             "CPU": cpu_usage or "-",
             "MEMORY": memory_usage or "-",
             "GPU": gpu_metrics or "-",
@@ -136,3 +136,18 @@ def _get_metric_value(job_metrics: JobMetrics, name: str) -> Optional[Any]:
         if metric.name == name:
             return metric.values[-1]
     return None
+
+
+def _format_memory(memory_bytes: int, decimal_places: int) -> str:
+    """See test_format_memory in tests/_internal/cli/commands/test_metrics.py for examples."""
+    memory_mb = memory_bytes / 1024 / 1024
+    if memory_mb >= 1024:
+        value = memory_mb / 1024
+        unit = "GB"
+    else:
+        value = memory_mb
+        unit = "MB"
+
+    if decimal_places == 0:
+        return f"{round(value)}{unit}"
+    return f"{value:.{decimal_places}f}".rstrip("0").rstrip(".") + unit

dstack/_internal/cli/commands/project.py (new file)
@@ -0,0 +1,161 @@
+import argparse
+
+from requests import HTTPError
+from rich.table import Table
+
+import dstack.api.server
+from dstack._internal.cli.commands import BaseCommand
+from dstack._internal.cli.utils.common import confirm_ask, console
+from dstack._internal.core.errors import ClientError, CLIError
+from dstack._internal.core.services.configs import ConfigManager
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class ProjectCommand(BaseCommand):
+    NAME = "project"
+    DESCRIPTION = "Manage projects configs"
+
+    def _register(self):
+        super()._register()
+        subparsers = self._parser.add_subparsers(dest="subcommand", help="Command to execute")
+
+        # Add subcommand
+        add_parser = subparsers.add_parser("add", help="Add or update a project config")
+        add_parser.add_argument(
+            "--name", type=str, help="The name of the project to configure", required=True
+        )
+        add_parser.add_argument("--url", type=str, help="Server url", required=True)
+        add_parser.add_argument("--token", type=str, help="User token", required=True)
+        add_parser.add_argument(
+            "-y",
+            "--yes",
+            help="Don't ask for confirmation (e.g. update the config)",
+            action="store_true",
+        )
+        add_parser.add_argument(
+            "-n",
+            "--no",
+            help="Don't ask for confirmation (e.g. do not update the config)",
+            action="store_true",
+        )
+        add_parser.set_defaults(subfunc=self._add)
+
+        # Delete subcommand
+        delete_parser = subparsers.add_parser("delete", help="Delete a project config")
+        delete_parser.add_argument(
+            "--name", type=str, help="The name of the project to delete", required=True
+        )
+        delete_parser.add_argument(
+            "-y",
+            "--yes",
+            help="Don't ask for confirmation",
+            action="store_true",
+        )
+        delete_parser.set_defaults(subfunc=self._delete)
+
+        # List subcommand
+        list_parser = subparsers.add_parser("list", help="List configured projects")
+        list_parser.set_defaults(subfunc=self._list)
+
+        # Set default subcommand
+        set_default_parser = subparsers.add_parser("set-default", help="Set default project")
+        set_default_parser.add_argument(
+            "name", type=str, help="The name of the project to set as default"
+        )
+        set_default_parser.set_defaults(subfunc=self._set_default)
+
+    def _command(self, args: argparse.Namespace):
+        if not hasattr(args, "subfunc"):
+            args.subfunc = self._list
+        args.subfunc(args)
+
+    def _add(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        api_client = dstack.api.server.APIClient(base_url=args.url, token=args.token)
+        try:
+            api_client.projects.get(args.name)
+        except HTTPError as e:
+            if e.response.status_code == 403:
+                raise CLIError("Forbidden. Ensure the token is valid.")
+            elif e.response.status_code == 404:
+                raise CLIError(f"Project '{args.name}' not found.")
+            else:
+                raise e
+        default_project = config_manager.get_project_config()
+        if (
+            default_project is None
+            or default_project.name != args.name
+            or default_project.url != args.url
+            or default_project.token != args.token
+        ):
+            set_it_as_default = (
+                (
+                    args.yes
+                    or not default_project
+                    or confirm_ask(f"Set '{args.name}' as your default project?")
+                )
+                if not args.no
+                else False
+            )
+            config_manager.configure_project(
+                name=args.name, url=args.url, token=args.token, default=set_it_as_default
+            )
+            config_manager.save()
+            logger.info(
+                f"Configuration updated at {config_manager.config_filepath}", {"show_path": False}
+            )
+
+    def _delete(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        if args.yes or confirm_ask(f"Are you sure you want to delete project '{args.name}'?"):
+            config_manager.delete_project(args.name)
+            config_manager.save()
+            console.print("[grey58]OK[/]")
+
+    def _list(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        default_project = config_manager.get_project_config()
+
+        table = Table(box=None)
+        table.add_column("PROJECT", style="bold", no_wrap=True)
+        table.add_column("URL", style="grey58")
+        table.add_column("USER", style="grey58")
+        table.add_column("DEFAULT", justify="center")
+
+        for project_name in config_manager.list_projects():
+            project_config = config_manager.get_project_config(project_name)
+            is_default = project_name == default_project.name if default_project else False
+
+            # Get username from API
+            try:
+                api_client = dstack.api.server.APIClient(
+                    base_url=project_config.url, token=project_config.token
+                )
+                user_info = api_client.users.get_my_user()
+                username = user_info.username
+            except ClientError:
+                username = "(invalid token)"
+
+            table.add_row(
+                project_name,
+                project_config.url,
+                username,
+                "✓" if is_default else "",
+                style="bold" if is_default else None,
+            )
+
+        console.print(table)
+
+    def _set_default(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        project_config = config_manager.get_project_config(args.name)
+        if project_config is None:
+            raise CLIError(f"Project '{args.name}' not found")
+
+        config_manager.configure_project(
+            name=args.name, url=project_config.url, token=project_config.token, default=True
+        )
+        config_manager.save()
+        console.print("[grey58]OK[/]")
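
The new dstack project command manages named server/token entries in the local dstack client config, for example `dstack project add --name main --url <server-url> --token <token>` followed by `dstack project set-default main`. Below is a minimal, hedged sketch of what those subcommands ultimately persist, using only the ConfigManager calls visible above; the name, URL, and token are placeholder values, and the real command first validates them against the server via APIClient.projects.get().

from dstack._internal.core.services.configs import ConfigManager

# Placeholder values for illustration only.
manager = ConfigManager()
manager.configure_project(
    name="main", url="https://my-dstack-server.example", token="<token>", default=True
)
manager.save()
print(manager.config_filepath)  # the config file that was just updated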

dstack/_internal/cli/commands/ps.py
@@ -36,10 +36,17 @@ class PsCommand(APIBaseCommand):
             help="Watch statuses of runs in realtime",
             action="store_true",
         )
+        self._parser.add_argument(
+            "-n",
+            "--last",
+            help="Show only the last N runs. Implies --all",
+            type=int,
+            default=None,
+        )
 
     def _command(self, args: argparse.Namespace):
         super()._command(args)
-        runs = self.api.runs.list(all=args.all)
+        runs = self.api.runs.list(all=args.all, limit=args.last)
         if not args.watch:
             console.print(run_utils.get_runs_table(runs, verbose=args.verbose))
             return
@@ -49,6 +56,6 @@ class PsCommand(APIBaseCommand):
                 while True:
                     live.update(run_utils.get_runs_table(runs, verbose=args.verbose))
                     time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS)
-                    runs = self.api.runs.list(all=args.all)
+                    runs = self.api.runs.list(all=args.all, limit=args.last)
         except KeyboardInterrupt:
             pass
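
The -n/--last flag maps onto the new limit parameter of the runs listing call. A hedged sketch of the equivalent request through the Python API follows; the client setup (Client.from_config() picking up the default project config) is assumed and is not part of this diff.

from dstack.api import Client

client = Client.from_config()  # assumes a configured default project
# Roughly what `dstack ps -n 5` asks for: the last 5 runs, including finished ones.
for run in client.runs.list(all=True, limit=5):
    print(run.name, run.status.value)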

dstack/_internal/cli/main.py
@@ -15,6 +15,7 @@ from dstack._internal.cli.commands.init import InitCommand
 from dstack._internal.cli.commands.logs import LogsCommand
 from dstack._internal.cli.commands.metrics import MetricsCommand
 from dstack._internal.cli.commands.offer import OfferCommand
+from dstack._internal.cli.commands.project import ProjectCommand
 from dstack._internal.cli.commands.ps import PsCommand
 from dstack._internal.cli.commands.server import ServerCommand
 from dstack._internal.cli.commands.stats import StatsCommand
@@ -69,6 +70,7 @@ def main():
     OfferCommand.register(subparsers)
     LogsCommand.register(subparsers)
     MetricsCommand.register(subparsers)
+    ProjectCommand.register(subparsers)
     PsCommand.register(subparsers)
     ServerCommand.register(subparsers)
     StatsCommand.register(subparsers)

dstack/_internal/core/backends/azure/compute.py
@@ -391,9 +391,9 @@ class VMImageVariant(enum.Enum):
 
 
 _SUPPORTED_VM_SERIES_PATTERNS = [
-    r"D(\d+)s_v3",  # Dsv3-series
-    r"E(\d+)i?s_v4",  # Esv4-series
-    r"E(\d+)-(\d+)s_v4",  # Esv4-series (constrained vCPU)
+    r"D(\d+)s_v6",  # Dsv6-series (general purpose)
+    r"E(\d+)i?s_v6",  # Esv6-series (memory optimized)
+    r"F(\d+)s_v2",  # Fsv2-series (compute optimized)
     r"NC(\d+)s_v3",  # NCv3-series [V100 16GB]
     r"NC(\d+)as_T4_v3",  # NCasT4_v3-series [T4]
     r"ND(\d+)rs_v2",  # NDv2-series [8xV100 32GB]
@@ -401,6 +401,11 @@ _SUPPORTED_VM_SERIES_PATTERNS = [
     r"NC(\d+)ads_A100_v4",  # NC A100 v4-series [A100 80GB]
     r"ND(\d+)asr_v4",  # ND A100 v4-series [8xA100 40GB]
     r"ND(\d+)amsr_A100_v4",  # NDm A100 v4-series [8xA100 80GB]
+    # Deprecated series
+    # TODO: Remove after several releases
+    r"D(\d+)s_v3",  # Dsv3-series (general purpose)
+    r"E(\d+)i?s_v4",  # Esv4-series (memory optimized)
+    r"E(\d+)-(\d+)s_v4",  # Esv4-series (constrained vCPU)
 ]
 _SUPPORTED_VM_SERIES_PATTERN = (
     "^Standard_(" + "|".join(f"({s})" for s in _SUPPORTED_VM_SERIES_PATTERNS) + ")$"

dstack/_internal/core/backends/base/compute.py
@@ -19,6 +19,7 @@ from dstack._internal.core.consts import (
     DSTACK_RUNNER_SSH_PORT,
     DSTACK_SHIM_HTTP_PORT,
 )
+from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
 from dstack._internal.core.models.gateways import (
     GatewayComputeConfiguration,
     GatewayProvisioningData,
@@ -754,7 +755,7 @@ def get_docker_commands(
             f" --ssh-port {DSTACK_RUNNER_SSH_PORT}"
             " --temp-dir /tmp/runner"
             " --home-dir /root"
-            " --working-dir /workflow"
+            f" --working-dir {DEFAULT_REPO_DIR}"
         ),
     ]
 

dstack/_internal/core/models/configurations.py
@@ -23,6 +23,10 @@ ValidPort = conint(gt=0, le=65536)
 MAX_INT64 = 2**63 - 1
 SERVICE_HTTPS_DEFAULT = True
 STRIP_PREFIX_DEFAULT = True
+RUN_PRIOTIRY_MIN = 0
+RUN_PRIOTIRY_MAX = 100
+RUN_PRIORITY_DEFAULT = 0
+DEFAULT_REPO_DIR = "/workflow"
 
 
 class RunConfigurationType(str, Enum):
@@ -77,7 +81,8 @@ class ScalingSpec(CoreModel):
         Field(
             description="The target value of the metric. "
             "The number of replicas is calculated based on this number and automatically adjusts "
-            "(scales up or down) as this metric changes"
+            "(scales up or down) as this metric changes",
+            gt=0,
         ),
     ]
     scale_up_delay: Annotated[
@@ -177,7 +182,7 @@ class BaseRunConfiguration(CoreModel):
         Field(
             description=(
                 "The path to the working directory inside the container."
-                " It's specified relative to the repository directory (`/workflow`) and should be inside it."
+                f" It's specified relative to the repository directory (`{DEFAULT_REPO_DIR}`) and should be inside it."
                 ' Defaults to `"."` '
             )
         ),
@@ -221,14 +226,26 @@ class BaseRunConfiguration(CoreModel):
             )
         ),
     ] = None
-    # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
-    setup: CommandsList = []
     resources: Annotated[
         ResourcesSpec, Field(description="The resources requirements to run the configuration")
     ] = ResourcesSpec()
+    priority: Annotated[
+        Optional[int],
+        Field(
+            ge=RUN_PRIOTIRY_MIN,
+            le=RUN_PRIOTIRY_MAX,
+            description=(
+                f"The priority of the run, an integer between `{RUN_PRIOTIRY_MIN}` and `{RUN_PRIOTIRY_MAX}`."
+                " `dstack` tries to provision runs with higher priority first."
+                f" Defaults to `{RUN_PRIORITY_DEFAULT}`"
+            ),
+        ),
+    ] = None
     volumes: Annotated[
         List[Union[MountPoint, str]], Field(description="The volumes mount points")
     ] = []
+    # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
+    setup: CommandsList = []
 
     @validator("python", pre=True, always=True)
     def convert_python(cls, v, values) -> Optional[PythonVersion]:
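
The new priority field is validated by Pydantic against the RUN_PRIOTIRY_MIN/RUN_PRIOTIRY_MAX bounds (0 and 100). Here is a standalone sketch of the same constraint; the model name is made up for illustration, while in dstack the field sits on BaseRunConfiguration.

from typing import Optional
from pydantic import BaseModel, Field, ValidationError

class PrioritySketch(BaseModel):
    # Mirrors ge=RUN_PRIOTIRY_MIN (0) and le=RUN_PRIOTIRY_MAX (100); None falls back to the default.
    priority: Optional[int] = Field(default=None, ge=0, le=100)

PrioritySketch(priority=50)       # accepted
try:
    PrioritySketch(priority=150)  # rejected: above the maximum of 100
except ValidationError as err:
    print(err)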

dstack/_internal/core/models/runs.py
@@ -8,6 +8,7 @@ from typing_extensions import Annotated
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import ApplyAction, CoreModel, NetworkMode, RegistryAuth
 from dstack._internal.core.models.configurations import (
+    DEFAULT_REPO_DIR,
     AnyRunConfiguration,
     RunConfiguration,
 )
@@ -338,7 +339,7 @@ class RunSpec(CoreModel):
         Field(
             description=(
                 "The path to the working directory inside the container."
-                " It's specified relative to the repository directory (`/workflow`) and should be inside it."
+                f" It's specified relative to the repository directory (`{DEFAULT_REPO_DIR}`) and should be inside it."
                 ' Defaults to `"."`.'
             )
         ),

dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf
@@ -1 +1,11 @@
-log_format dstack_stat '$time_iso8601 $host $status $request_time';
+log_format dstack_stat '$time_iso8601 $host $status $request_time $dstack_replica_hit';
+
+
+# A hack to avoid this Nginx reload error when no services are registered:
+# nginx: [emerg] unknown "dstack_replica_hit" variable
+server {
+    listen unix:/tmp/dstack-dummy-nginx.sock;
+    server_name placeholder.local;
+    deny all;
+    set $dstack_replica_hit 0;
+}

dstack/_internal/proxy/gateway/resources/nginx/service.jinja2
@@ -14,6 +14,7 @@ upstream {{ domain }}.upstream {
 server {
     server_name {{ domain }};
     limit_req_status 429;
+    set $dstack_replica_hit 0;
     access_log {{ access_log_path }} dstack_stat;
     client_max_body_size {{ client_max_body_size }};
 
@@ -23,11 +24,7 @@ server {
         auth_request /_dstack_auth;
         {% endif %}
 
-        {% if replicas %}
         try_files /nonexistent @$http_upgrade;
-        {% else %}
-        return 503;
-        {% endif %}
 
         {% if location.limit_req %}
         limit_req zone={{ location.limit_req.zone }}{% if location.limit_req.burst %} burst={{ location.limit_req.burst }} nodelay{% endif %};
@@ -35,8 +32,9 @@ server {
     }
     {% endfor %}
 
-    {% if replicas %}
     location @websocket {
+        set $dstack_replica_hit 1;
+        {% if replicas %}
         proxy_pass http://{{ domain }}.upstream;
         proxy_set_header X-Real-IP $remote_addr;
         proxy_set_header Host $host;
@@ -44,19 +42,27 @@ server {
         proxy_set_header Upgrade $http_upgrade;
         proxy_set_header Connection "Upgrade";
         proxy_read_timeout 300s;
+        {% else %}
+        return 503;
+        {% endif %}
     }
     location @ {
+        set $dstack_replica_hit 1;
+        {% if replicas %}
         proxy_pass http://{{ domain }}.upstream;
         proxy_set_header X-Real-IP $remote_addr;
         proxy_set_header Host $host;
         proxy_read_timeout 300s;
+        {% else %}
+        return 503;
+        {% endif %}
     }
-    {% endif %}
 
     {% if auth %}
     location = /_dstack_auth {
         internal;
         if ($remote_addr = 127.0.0.1) {
+            # for requests from the gateway app, e.g. from the OpenAI-compatible API
             return 200;
         }
         proxy_pass http://localhost:{{ proxy_port }}/api/auth/{{ project_name }};

dstack/_internal/proxy/gateway/services/stats.py
@@ -11,10 +11,10 @@ from pydantic import BaseModel
 
 from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo
 from dstack._internal.proxy.gateway.schemas.stats import PerWindowStats, ServiceStats, Stat
+from dstack._internal.proxy.lib.errors import UnexpectedProxyError
 from dstack._internal.utils.common import run_async
 
 logger = logging.getLogger(__name__)
-IGNORE_STATUSES = {403, 404}
 WINDOWS = (30, 60, 300)
 TTL = WINDOWS[-1]
 EMPTY_STATS = {window: Stat(requests=0, request_time=0.0) for window in WINDOWS}
@@ -35,6 +35,7 @@ class LogEntry(BaseModel):
     host: str
     status: int
     request_time: float
+    is_replica_hit: bool
 
 
 class StatsCollector:
@@ -87,7 +88,8 @@ class StatsCollector:
         now = datetime.datetime.now(tz=datetime.timezone.utc)
 
         for entry in self._read_access_log(now - datetime.timedelta(seconds=TTL)):
-            if entry.status in IGNORE_STATUSES:
+            # only include requests that hit or should hit a service replica
+            if not entry.is_replica_hit:
                 continue
 
             frame_timestamp = int(entry.timestamp.timestamp())
@@ -119,7 +121,10 @@
             line = self._file.readline()
             if not line:
                 break
-            timestamp_str, host, status, request_time = line.split()
+            cells = line.split()
+            if len(cells) == 4:  # compatibility with pre-0.19.11 logs
+                cells.append("0" if cells[2] in ["403", "404"] else "1")
+            timestamp_str, host, status, request_time, dstack_replica_hit = cells
             timestamp = datetime.datetime.fromisoformat(timestamp_str)
             if timestamp < after:
                 continue
@@ -128,6 +133,7 @@
                 host=host,
                 status=int(status),
                 request_time=float(request_time),
+                is_replica_hit=_parse_nginx_bool(dstack_replica_hit),
             )
         if os.fstat(self._file.fileno()).st_ino != st_ino:
             # file was rotated
@@ -154,3 +160,11 @@ async def get_service_stats(
         )
         for service in services
     ]
+
+
+def _parse_nginx_bool(v: str) -> bool:
+    if v == "0":
+        return False
+    if v == "1":
+        return True
+    raise UnexpectedProxyError(f"Cannot parse boolean value: expected '0' or '1', got {v!r}")
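
With the extra $dstack_replica_hit field in the access log, the collector now expects five cells per line and back-fills the fifth cell for logs written before 0.19.11 using the old status-based heuristic. A hedged illustration with invented log lines:

# Sample lines are made up; the format follows the dstack_stat log_format above.
new_line = "2025-01-01T00:00:00+00:00 svc.gateway.example 200 0.012 1"
old_line = "2025-01-01T00:00:00+00:00 svc.gateway.example 404 0.001"

for line in (new_line, old_line):
    cells = line.split()
    if len(cells) == 4:  # pre-0.19.11 logs: infer the flag from the status code
        cells.append("0" if cells[2] in ["403", "404"] else "1")
    timestamp_str, host, status, request_time, replica_hit = cells
    print(host, status, replica_hit == "1")  # the old-format 404 line is treated as a miss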

dstack/_internal/server/background/tasks/process_metrics.py
@@ -2,7 +2,7 @@ import asyncio
 import json
 from typing import Dict, List, Optional
 
-from sqlalchemy import delete, select
+from sqlalchemy import Delete, delete, select
 from sqlalchemy.orm import joinedload
 
 from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT
@@ -49,27 +49,29 @@ async def delete_metrics():
     finished_timestamp_micro_cutoff = (
         now_timestamp_micro - settings.SERVER_METRICS_FINISHED_TTL_SECONDS * 1_000_000
     )
+    await asyncio.gather(
+        _execute_delete_statement(
+            delete(JobMetricsPoint).where(
+                JobMetricsPoint.job_id.in_(
+                    select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
+                ),
+                JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
+            )
+        ),
+        _execute_delete_statement(
+            delete(JobMetricsPoint).where(
+                JobMetricsPoint.job_id.in_(
+                    select(JobModel.id).where(JobModel.status.in_(JobStatus.finished_statuses()))
+                ),
+                JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
+            )
+        ),
+    )
+
+
+async def _execute_delete_statement(stmt: Delete) -> None:
     async with get_session_ctx() as session:
-        await asyncio.gather(
-            session.execute(
-                delete(JobMetricsPoint).where(
-                    JobMetricsPoint.job_id.in_(
-                        select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
-                    ),
-                    JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
-                )
-            ),
-            session.execute(
-                delete(JobMetricsPoint).where(
-                    JobMetricsPoint.job_id.in_(
-                        select(JobModel.id).where(
-                            JobModel.status.in_(JobStatus.finished_statuses())
-                        )
-                    ),
-                    JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
-                )
-            ),
-        )
+        await session.execute(stmt)
         await session.commit()
 
 

dstack/_internal/server/background/tasks/process_submitted_jobs.py
@@ -93,11 +93,20 @@ async def _process_next_submitted_job():
     async with lock:
         res = await session.execute(
             select(JobModel)
+            .join(JobModel.run)
             .where(
                 JobModel.status == JobStatus.SUBMITTED,
                 JobModel.id.not_in(lockset),
             )
-            .order_by(JobModel.last_processed_at.asc())
+            # Jobs are process in FIFO sorted by priority globally,
+            # thus runs from different project can "overtake" each other by using higher priorities.
+            # That's not a big problem as long as projects do not compete for the same compute resources.
+            # Jobs with lower priorities from other projects will be processed without major lag
+            # as long as new higher priority runs are not constantly submitted.
+            # TODO: Consider processing jobs from different projects fairly/round-robin
+            # Fully fair processing can be tricky to implement via the current DB queue as
+            # there can be many projects and we are limited by the max DB connections.
+            .order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc())
             .limit(1)
             .with_for_update(skip_locked=True)
         )
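
In effect, the queue now orders submitted jobs by run priority first and by how long ago they were last processed second. A toy illustration of the same ordering rule with invented data:

# (name, priority, last_processed_at) tuples; higher priority first, older timestamp first within a priority.
jobs = [("job-a", 0, 3), ("job-b", 50, 7), ("job-c", 50, 2), ("job-d", 0, 1)]
ordered = sorted(jobs, key=lambda j: (-j[1], j[2]))
print([name for name, _, _ in ordered])  # ['job-c', 'job-b', 'job-d', 'job-a']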

@@ -360,16 +369,16 @@
         (instance, common_utils.get_or_error(get_instance_offer(instance)))
         for instance in nonshared_instances
     ]
-    if not multinode:
-        shared_instances_with_offers = get_shared_pool_instances_with_offers(
-            pool_instances=pool_instances,
-            profile=profile,
-            requirements=job.job_spec.requirements,
-            idle_only=True,
-            fleet_model=fleet_model,
-            volumes=volumes,
-        )
-        instances_with_offers.extend(shared_instances_with_offers)
+    shared_instances_with_offers = get_shared_pool_instances_with_offers(
+        pool_instances=pool_instances,
+        profile=profile,
+        requirements=job.job_spec.requirements,
+        idle_only=True,
+        fleet_model=fleet_model,
+        multinode=multinode,
+        volumes=volumes,
+    )
+    instances_with_offers.extend(shared_instances_with_offers)
 
     if len(instances_with_offers) == 0:
         return None
@@ -572,7 +581,7 @@
 
 
 def _prepare_job_runtime_data(offer: InstanceOfferWithAvailability) -> JobRuntimeData:
-    if offer.total_blocks == 1:
+    if offer.blocks == offer.total_blocks:
         if env_utils.get_bool("DSTACK_FORCE_BRIDGE_NETWORK"):
             network_mode = NetworkMode.BRIDGE
         else: