dstack 0.19.9__py3-none-any.whl → 0.19.11rc1__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (37)
  1. dstack/_internal/cli/commands/config.py +1 -1
  2. dstack/_internal/cli/commands/metrics.py +25 -10
  3. dstack/_internal/cli/commands/project.py +161 -0
  4. dstack/_internal/cli/commands/ps.py +9 -2
  5. dstack/_internal/cli/main.py +2 -0
  6. dstack/_internal/core/backends/azure/compute.py +8 -3
  7. dstack/_internal/core/backends/base/compute.py +2 -1
  8. dstack/_internal/core/models/configurations.py +21 -4
  9. dstack/_internal/core/models/runs.py +2 -1
  10. dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +11 -1
  11. dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +12 -6
  12. dstack/_internal/proxy/gateway/services/stats.py +17 -3
  13. dstack/_internal/server/background/tasks/process_metrics.py +23 -21
  14. dstack/_internal/server/background/tasks/process_submitted_jobs.py +21 -12
  15. dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
  16. dstack/_internal/server/models.py +1 -0
  17. dstack/_internal/server/routers/repos.py +8 -4
  18. dstack/_internal/server/services/instances.py +6 -2
  19. dstack/_internal/server/services/jobs/configurators/base.py +18 -4
  20. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -1
  21. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +3 -1
  22. dstack/_internal/server/services/runs.py +31 -18
  23. dstack/_internal/server/settings.py +1 -0
  24. dstack/_internal/server/statics/index.html +1 -1
  25. dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js → main-b4803049eac16aea9a49.js} +4 -4
  26. dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js.map → main-b4803049eac16aea9a49.js.map} +1 -1
  27. dstack/_internal/server/testing/common.py +2 -0
  28. dstack/_internal/server/utils/routers.py +3 -6
  29. dstack/_internal/settings.py +4 -0
  30. dstack/api/_public/runs.py +6 -3
  31. dstack/api/server/_runs.py +2 -0
  32. dstack/version.py +2 -2
  33. {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/METADATA +11 -6
  34. {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/RECORD +37 -35
  35. {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/WHEEL +0 -0
  36. {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/entry_points.txt +0 -0
  37. {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/cli/commands/config.py
@@ -14,7 +14,7 @@ logger = get_logger(__name__)
 
 class ConfigCommand(BaseCommand):
     NAME = "config"
-    DESCRIPTION = "Configure CLI"
+    DESCRIPTION = "Configure CLI (deprecated; use `dstack project`)"
 
     def _register(self):
         super()._register()

dstack/_internal/cli/commands/metrics.py
@@ -39,8 +39,6 @@ class MetricsCommand(APIBaseCommand):
         run = self.api.runs.get(run_name=args.run_name)
         if run is None:
             raise CLIError(f"Run {args.run_name} not found")
-        if run.status.is_finished():
-            raise CLIError(f"Run {args.run_name} is finished")
         metrics = _get_run_jobs_metrics(api=self.api, run=run)
 
         if not args.watch:
@@ -55,8 +53,6 @@ class MetricsCommand(APIBaseCommand):
                     run = self.api.runs.get(run_name=args.run_name)
                     if run is None:
                         raise CLIError(f"Run {args.run_name} not found")
-                    if run.status.is_finished():
-                        raise CLIError(f"Run {args.run_name} is finished")
                     metrics = _get_run_jobs_metrics(api=self.api, run=run)
         except KeyboardInterrupt:
             pass
@@ -78,11 +74,12 @@ def _get_run_jobs_metrics(api: Client, run: Run) -> List[JobMetrics]:
 def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
     table = Table(box=None)
     table.add_column("NAME", style="bold", no_wrap=True)
+    table.add_column("STATUS")
     table.add_column("CPU")
     table.add_column("MEMORY")
     table.add_column("GPU")
 
-    run_row: Dict[Union[str, int], Any] = {"NAME": run.name}
+    run_row: Dict[Union[str, int], Any] = {"NAME": run.name, "STATUS": run.status.value}
     if len(run._run.jobs) != 1:
         add_row_from_dict(table, run_row)
 
@@ -101,9 +98,9 @@ def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
             cpu_usage = f"{cpu_usage:.0f}%"
         memory_usage = _get_metric_value(job_metrics, "memory_working_set_bytes")
         if memory_usage is not None:
-            memory_usage = f"{round(memory_usage / 1024 / 1024)}MB"
+            memory_usage = _format_memory(memory_usage, 2)
            if resources is not None:
-                memory_usage += f"/{resources.memory_mib}MB"
+                memory_usage += f"/{_format_memory(resources.memory_mib * 1024 * 1024, 2)}"
         gpu_metrics = ""
         gpus_detected_num = _get_metric_value(job_metrics, "gpus_detected_num")
         if gpus_detected_num is not None:
@@ -113,13 +110,16 @@ def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
                 if gpu_memory_usage is not None:
                     if i != 0:
                         gpu_metrics += "\n"
-                    gpu_metrics += f"#{i} {round(gpu_memory_usage / 1024 / 1024)}MB"
+                    gpu_metrics += f"gpu={i} mem={_format_memory(gpu_memory_usage, 2)}"
                     if resources is not None:
-                        gpu_metrics += f"/{resources.gpus[i].memory_mib}MB"
-                    gpu_metrics += f" {gpu_util_percent}% Util"
+                        gpu_metrics += (
+                            f"/{_format_memory(resources.gpus[i].memory_mib * 1024 * 1024, 2)}"
+                        )
+                    gpu_metrics += f" util={gpu_util_percent}%"
 
         job_row: Dict[Union[str, int], Any] = {
             "NAME": f" replica={job.job_spec.replica_num} job={job.job_spec.job_num}",
+            "STATUS": job.job_submissions[-1].status.value,
             "CPU": cpu_usage or "-",
             "MEMORY": memory_usage or "-",
             "GPU": gpu_metrics or "-",
@@ -136,3 +136,18 @@ def _get_metric_value(job_metrics: JobMetrics, name: str) -> Optional[Any]:
         if metric.name == name:
             return metric.values[-1]
     return None
+
+
+def _format_memory(memory_bytes: int, decimal_places: int) -> str:
+    """See test_format_memory in tests/_internal/cli/commands/test_metrics.py for examples."""
+    memory_mb = memory_bytes / 1024 / 1024
+    if memory_mb >= 1024:
+        value = memory_mb / 1024
+        unit = "GB"
+    else:
+        value = memory_mb
+        unit = "MB"
+
+    if decimal_places == 0:
+        return f"{round(value)}{unit}"
+    return f"{value:.{decimal_places}f}".rstrip("0").rstrip(".") + unit

dstack/_internal/cli/commands/project.py (new file)
@@ -0,0 +1,161 @@
+import argparse
+
+from requests import HTTPError
+from rich.table import Table
+
+import dstack.api.server
+from dstack._internal.cli.commands import BaseCommand
+from dstack._internal.cli.utils.common import confirm_ask, console
+from dstack._internal.core.errors import ClientError, CLIError
+from dstack._internal.core.services.configs import ConfigManager
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class ProjectCommand(BaseCommand):
+    NAME = "project"
+    DESCRIPTION = "Manage projects configs"
+
+    def _register(self):
+        super()._register()
+        subparsers = self._parser.add_subparsers(dest="subcommand", help="Command to execute")
+
+        # Add subcommand
+        add_parser = subparsers.add_parser("add", help="Add or update a project config")
+        add_parser.add_argument(
+            "--name", type=str, help="The name of the project to configure", required=True
+        )
+        add_parser.add_argument("--url", type=str, help="Server url", required=True)
+        add_parser.add_argument("--token", type=str, help="User token", required=True)
+        add_parser.add_argument(
+            "-y",
+            "--yes",
+            help="Don't ask for confirmation (e.g. update the config)",
+            action="store_true",
+        )
+        add_parser.add_argument(
+            "-n",
+            "--no",
+            help="Don't ask for confirmation (e.g. do not update the config)",
+            action="store_true",
+        )
+        add_parser.set_defaults(subfunc=self._add)
+
+        # Delete subcommand
+        delete_parser = subparsers.add_parser("delete", help="Delete a project config")
+        delete_parser.add_argument(
+            "--name", type=str, help="The name of the project to delete", required=True
+        )
+        delete_parser.add_argument(
+            "-y",
+            "--yes",
+            help="Don't ask for confirmation",
+            action="store_true",
+        )
+        delete_parser.set_defaults(subfunc=self._delete)
+
+        # List subcommand
+        list_parser = subparsers.add_parser("list", help="List configured projects")
+        list_parser.set_defaults(subfunc=self._list)
+
+        # Set default subcommand
+        set_default_parser = subparsers.add_parser("set-default", help="Set default project")
+        set_default_parser.add_argument(
+            "name", type=str, help="The name of the project to set as default"
+        )
+        set_default_parser.set_defaults(subfunc=self._set_default)
+
+    def _command(self, args: argparse.Namespace):
+        if not hasattr(args, "subfunc"):
+            args.subfunc = self._list
+        args.subfunc(args)
+
+    def _add(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        api_client = dstack.api.server.APIClient(base_url=args.url, token=args.token)
+        try:
+            api_client.projects.get(args.name)
+        except HTTPError as e:
+            if e.response.status_code == 403:
+                raise CLIError("Forbidden. Ensure the token is valid.")
+            elif e.response.status_code == 404:
+                raise CLIError(f"Project '{args.name}' not found.")
+            else:
+                raise e
+        default_project = config_manager.get_project_config()
+        if (
+            default_project is None
+            or default_project.name != args.name
+            or default_project.url != args.url
+            or default_project.token != args.token
+        ):
+            set_it_as_default = (
+                (
+                    args.yes
+                    or not default_project
+                    or confirm_ask(f"Set '{args.name}' as your default project?")
+                )
+                if not args.no
+                else False
+            )
+            config_manager.configure_project(
+                name=args.name, url=args.url, token=args.token, default=set_it_as_default
+            )
+            config_manager.save()
+            logger.info(
+                f"Configuration updated at {config_manager.config_filepath}", {"show_path": False}
+            )
+
+    def _delete(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        if args.yes or confirm_ask(f"Are you sure you want to delete project '{args.name}'?"):
+            config_manager.delete_project(args.name)
+            config_manager.save()
+            console.print("[grey58]OK[/]")
+
+    def _list(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        default_project = config_manager.get_project_config()
+
+        table = Table(box=None)
+        table.add_column("PROJECT", style="bold", no_wrap=True)
+        table.add_column("URL", style="grey58")
+        table.add_column("USER", style="grey58")
+        table.add_column("DEFAULT", justify="center")
+
+        for project_name in config_manager.list_projects():
+            project_config = config_manager.get_project_config(project_name)
+            is_default = project_name == default_project.name if default_project else False
+
+            # Get username from API
+            try:
+                api_client = dstack.api.server.APIClient(
+                    base_url=project_config.url, token=project_config.token
+                )
+                user_info = api_client.users.get_my_user()
+                username = user_info.username
+            except ClientError:
+                username = "(invalid token)"
+
+            table.add_row(
+                project_name,
+                project_config.url,
+                username,
+                "✓" if is_default else "",
+                style="bold" if is_default else None,
+            )
+
+        console.print(table)
+
+    def _set_default(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        project_config = config_manager.get_project_config(args.name)
+        if project_config is None:
+            raise CLIError(f"Project '{args.name}' not found")
+
+        config_manager.configure_project(
+            name=args.name, url=project_config.url, token=project_config.token, default=True
+        )
+        config_manager.save()
+        console.print("[grey58]OK[/]")
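
The new dstack project command manages named server/token entries in the local dstack client config, for example `dstack project add --name main --url <server-url> --token <token>` followed by `dstack project set-default main`. Below is a minimal, hedged sketch of what those subcommands ultimately persist, using only the ConfigManager calls visible above; the name, URL, and token are placeholder values, and the real command first validates them against the server via APIClient.projects.get().

from dstack._internal.core.services.configs import ConfigManager

# Placeholder values for illustration only.
manager = ConfigManager()
manager.configure_project(
    name="main", url="https://my-dstack-server.example", token="<token>", default=True
)
manager.save()
print(manager.config_filepath)  # the config file that was just updated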

dstack/_internal/cli/commands/ps.py
@@ -36,10 +36,17 @@ class PsCommand(APIBaseCommand):
             help="Watch statuses of runs in realtime",
             action="store_true",
         )
+        self._parser.add_argument(
+            "-n",
+            "--last",
+            help="Show only the last N runs. Implies --all",
+            type=int,
+            default=None,
+        )
 
     def _command(self, args: argparse.Namespace):
         super()._command(args)
-        runs = self.api.runs.list(all=args.all)
+        runs = self.api.runs.list(all=args.all, limit=args.last)
         if not args.watch:
             console.print(run_utils.get_runs_table(runs, verbose=args.verbose))
             return
@@ -49,6 +56,6 @@ class PsCommand(APIBaseCommand):
                 while True:
                     live.update(run_utils.get_runs_table(runs, verbose=args.verbose))
                     time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS)
-                    runs = self.api.runs.list(all=args.all)
+                    runs = self.api.runs.list(all=args.all, limit=args.last)
         except KeyboardInterrupt:
             pass
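
The -n/--last flag maps onto the new limit parameter of the runs listing call. A hedged sketch of the equivalent request through the Python API follows; the client setup (Client.from_config() picking up the default project config) is assumed and is not part of this diff.

from dstack.api import Client

client = Client.from_config()  # assumes a configured default project
# Roughly what `dstack ps -n 5` asks for: the last 5 runs, including finished ones.
for run in client.runs.list(all=True, limit=5):
    print(run.name, run.status.value)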

dstack/_internal/cli/main.py
@@ -15,6 +15,7 @@ from dstack._internal.cli.commands.init import InitCommand
 from dstack._internal.cli.commands.logs import LogsCommand
 from dstack._internal.cli.commands.metrics import MetricsCommand
 from dstack._internal.cli.commands.offer import OfferCommand
+from dstack._internal.cli.commands.project import ProjectCommand
 from dstack._internal.cli.commands.ps import PsCommand
 from dstack._internal.cli.commands.server import ServerCommand
 from dstack._internal.cli.commands.stats import StatsCommand
@@ -69,6 +70,7 @@ def main():
     OfferCommand.register(subparsers)
     LogsCommand.register(subparsers)
     MetricsCommand.register(subparsers)
+    ProjectCommand.register(subparsers)
     PsCommand.register(subparsers)
     ServerCommand.register(subparsers)
     StatsCommand.register(subparsers)

dstack/_internal/core/backends/azure/compute.py
@@ -391,9 +391,9 @@ class VMImageVariant(enum.Enum):
 
 
 _SUPPORTED_VM_SERIES_PATTERNS = [
-    r"D(\d+)s_v3",  # Dsv3-series
-    r"E(\d+)i?s_v4",  # Esv4-series
-    r"E(\d+)-(\d+)s_v4",  # Esv4-series (constrained vCPU)
+    r"D(\d+)s_v6",  # Dsv6-series (general purpose)
+    r"E(\d+)i?s_v6",  # Esv6-series (memory optimized)
+    r"F(\d+)s_v2",  # Fsv2-series (compute optimized)
     r"NC(\d+)s_v3",  # NCv3-series [V100 16GB]
     r"NC(\d+)as_T4_v3",  # NCasT4_v3-series [T4]
     r"ND(\d+)rs_v2",  # NDv2-series [8xV100 32GB]
@@ -401,6 +401,11 @@ _SUPPORTED_VM_SERIES_PATTERNS = [
     r"NC(\d+)ads_A100_v4",  # NC A100 v4-series [A100 80GB]
     r"ND(\d+)asr_v4",  # ND A100 v4-series [8xA100 40GB]
     r"ND(\d+)amsr_A100_v4",  # NDm A100 v4-series [8xA100 80GB]
+    # Deprecated series
+    # TODO: Remove after several releases
+    r"D(\d+)s_v3",  # Dsv3-series (general purpose)
+    r"E(\d+)i?s_v4",  # Esv4-series (memory optimized)
+    r"E(\d+)-(\d+)s_v4",  # Esv4-series (constrained vCPU)
 ]
 _SUPPORTED_VM_SERIES_PATTERN = (
     "^Standard_(" + "|".join(f"({s})" for s in _SUPPORTED_VM_SERIES_PATTERNS) + ")$"

dstack/_internal/core/backends/base/compute.py
@@ -19,6 +19,7 @@ from dstack._internal.core.consts import (
     DSTACK_RUNNER_SSH_PORT,
     DSTACK_SHIM_HTTP_PORT,
 )
+from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
 from dstack._internal.core.models.gateways import (
     GatewayComputeConfiguration,
     GatewayProvisioningData,
@@ -754,7 +755,7 @@ def get_docker_commands(
             f" --ssh-port {DSTACK_RUNNER_SSH_PORT}"
             " --temp-dir /tmp/runner"
             " --home-dir /root"
-            " --working-dir /workflow"
+            f" --working-dir {DEFAULT_REPO_DIR}"
         ),
     ]
 

dstack/_internal/core/models/configurations.py
@@ -23,6 +23,10 @@ ValidPort = conint(gt=0, le=65536)
 MAX_INT64 = 2**63 - 1
 SERVICE_HTTPS_DEFAULT = True
 STRIP_PREFIX_DEFAULT = True
+RUN_PRIOTIRY_MIN = 0
+RUN_PRIOTIRY_MAX = 100
+RUN_PRIORITY_DEFAULT = 0
+DEFAULT_REPO_DIR = "/workflow"
 
 
 class RunConfigurationType(str, Enum):
@@ -77,7 +81,8 @@ class ScalingSpec(CoreModel):
         Field(
             description="The target value of the metric. "
             "The number of replicas is calculated based on this number and automatically adjusts "
-            "(scales up or down) as this metric changes"
+            "(scales up or down) as this metric changes",
+            gt=0,
         ),
     ]
     scale_up_delay: Annotated[
@@ -177,7 +182,7 @@ class BaseRunConfiguration(CoreModel):
         Field(
             description=(
                 "The path to the working directory inside the container."
-                " It's specified relative to the repository directory (`/workflow`) and should be inside it."
+                f" It's specified relative to the repository directory (`{DEFAULT_REPO_DIR}`) and should be inside it."
                 ' Defaults to `"."` '
             )
         ),
@@ -221,14 +226,26 @@ class BaseRunConfiguration(CoreModel):
             )
         ),
     ] = None
-    # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
-    setup: CommandsList = []
     resources: Annotated[
         ResourcesSpec, Field(description="The resources requirements to run the configuration")
     ] = ResourcesSpec()
+    priority: Annotated[
+        Optional[int],
+        Field(
+            ge=RUN_PRIOTIRY_MIN,
+            le=RUN_PRIOTIRY_MAX,
+            description=(
+                f"The priority of the run, an integer between `{RUN_PRIOTIRY_MIN}` and `{RUN_PRIOTIRY_MAX}`."
+                " `dstack` tries to provision runs with higher priority first."
+                f" Defaults to `{RUN_PRIORITY_DEFAULT}`"
+            ),
+        ),
+    ] = None
     volumes: Annotated[
         List[Union[MountPoint, str]], Field(description="The volumes mount points")
     ] = []
+    # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
+    setup: CommandsList = []
 
     @validator("python", pre=True, always=True)
     def convert_python(cls, v, values) -> Optional[PythonVersion]:
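
The new priority field is validated by Pydantic against the RUN_PRIOTIRY_MIN/RUN_PRIOTIRY_MAX bounds (0 and 100). Here is a standalone sketch of the same constraint; the model name is made up for illustration, while in dstack the field sits on BaseRunConfiguration.

from typing import Optional
from pydantic import BaseModel, Field, ValidationError

class PrioritySketch(BaseModel):
    # Mirrors ge=RUN_PRIOTIRY_MIN (0) and le=RUN_PRIOTIRY_MAX (100); None falls back to the default.
    priority: Optional[int] = Field(default=None, ge=0, le=100)

PrioritySketch(priority=50)       # accepted
try:
    PrioritySketch(priority=150)  # rejected: above the maximum of 100
except ValidationError as err:
    print(err)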

dstack/_internal/core/models/runs.py
@@ -8,6 +8,7 @@ from typing_extensions import Annotated
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import ApplyAction, CoreModel, NetworkMode, RegistryAuth
 from dstack._internal.core.models.configurations import (
+    DEFAULT_REPO_DIR,
     AnyRunConfiguration,
     RunConfiguration,
 )
@@ -338,7 +339,7 @@ class RunSpec(CoreModel):
         Field(
             description=(
                 "The path to the working directory inside the container."
-                " It's specified relative to the repository directory (`/workflow`) and should be inside it."
+                f" It's specified relative to the repository directory (`{DEFAULT_REPO_DIR}`) and should be inside it."
                 ' Defaults to `"."`.'
             )
         ),

dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf
@@ -1 +1,11 @@
-log_format dstack_stat '$time_iso8601 $host $status $request_time';
+log_format dstack_stat '$time_iso8601 $host $status $request_time $dstack_replica_hit';
+
+
+# A hack to avoid this Nginx reload error when no services are registered:
+# nginx: [emerg] unknown "dstack_replica_hit" variable
+server {
+    listen unix:/tmp/dstack-dummy-nginx.sock;
+    server_name placeholder.local;
+    deny all;
+    set $dstack_replica_hit 0;
+}

dstack/_internal/proxy/gateway/resources/nginx/service.jinja2
@@ -14,6 +14,7 @@ upstream {{ domain }}.upstream {
 server {
     server_name {{ domain }};
     limit_req_status 429;
+    set $dstack_replica_hit 0;
     access_log {{ access_log_path }} dstack_stat;
     client_max_body_size {{ client_max_body_size }};
 
@@ -23,11 +24,7 @@ server {
         auth_request /_dstack_auth;
         {% endif %}
 
-        {% if replicas %}
         try_files /nonexistent @$http_upgrade;
-        {% else %}
-        return 503;
-        {% endif %}
 
         {% if location.limit_req %}
         limit_req zone={{ location.limit_req.zone }}{% if location.limit_req.burst %} burst={{ location.limit_req.burst }} nodelay{% endif %};
@@ -35,8 +32,9 @@ server {
     }
     {% endfor %}
 
-    {% if replicas %}
     location @websocket {
+        set $dstack_replica_hit 1;
+        {% if replicas %}
         proxy_pass http://{{ domain }}.upstream;
         proxy_set_header X-Real-IP $remote_addr;
         proxy_set_header Host $host;
@@ -44,19 +42,27 @@ server {
         proxy_set_header Upgrade $http_upgrade;
         proxy_set_header Connection "Upgrade";
         proxy_read_timeout 300s;
+        {% else %}
+        return 503;
+        {% endif %}
     }
     location @ {
+        set $dstack_replica_hit 1;
+        {% if replicas %}
         proxy_pass http://{{ domain }}.upstream;
         proxy_set_header X-Real-IP $remote_addr;
         proxy_set_header Host $host;
         proxy_read_timeout 300s;
+        {% else %}
+        return 503;
+        {% endif %}
     }
-    {% endif %}
 
     {% if auth %}
     location = /_dstack_auth {
         internal;
         if ($remote_addr = 127.0.0.1) {
+            # for requests from the gateway app, e.g. from the OpenAI-compatible API
             return 200;
         }
         proxy_pass http://localhost:{{ proxy_port }}/api/auth/{{ project_name }};

dstack/_internal/proxy/gateway/services/stats.py
@@ -11,10 +11,10 @@ from pydantic import BaseModel
 
 from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo
 from dstack._internal.proxy.gateway.schemas.stats import PerWindowStats, ServiceStats, Stat
+from dstack._internal.proxy.lib.errors import UnexpectedProxyError
 from dstack._internal.utils.common import run_async
 
 logger = logging.getLogger(__name__)
-IGNORE_STATUSES = {403, 404}
 WINDOWS = (30, 60, 300)
 TTL = WINDOWS[-1]
 EMPTY_STATS = {window: Stat(requests=0, request_time=0.0) for window in WINDOWS}
@@ -35,6 +35,7 @@ class LogEntry(BaseModel):
     host: str
     status: int
     request_time: float
+    is_replica_hit: bool
 
 
 class StatsCollector:
@@ -87,7 +88,8 @@ class StatsCollector:
         now = datetime.datetime.now(tz=datetime.timezone.utc)
 
         for entry in self._read_access_log(now - datetime.timedelta(seconds=TTL)):
-            if entry.status in IGNORE_STATUSES:
+            # only include requests that hit or should hit a service replica
+            if not entry.is_replica_hit:
                 continue
 
             frame_timestamp = int(entry.timestamp.timestamp())
@@ -119,7 +121,10 @@
             line = self._file.readline()
             if not line:
                 break
-            timestamp_str, host, status, request_time = line.split()
+            cells = line.split()
+            if len(cells) == 4:  # compatibility with pre-0.19.11 logs
+                cells.append("0" if cells[2] in ["403", "404"] else "1")
+            timestamp_str, host, status, request_time, dstack_replica_hit = cells
             timestamp = datetime.datetime.fromisoformat(timestamp_str)
             if timestamp < after:
                 continue
@@ -128,6 +133,7 @@
                 host=host,
                 status=int(status),
                 request_time=float(request_time),
+                is_replica_hit=_parse_nginx_bool(dstack_replica_hit),
             )
         if os.fstat(self._file.fileno()).st_ino != st_ino:
             # file was rotated
@@ -154,3 +160,11 @@ async def get_service_stats(
         )
         for service in services
     ]
+
+
+def _parse_nginx_bool(v: str) -> bool:
+    if v == "0":
+        return False
+    if v == "1":
+        return True
+    raise UnexpectedProxyError(f"Cannot parse boolean value: expected '0' or '1', got {v!r}")
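
With the extra $dstack_replica_hit field in the access log, the collector now expects five cells per line and back-fills the fifth cell for logs written before 0.19.11 using the old status-based heuristic. A hedged illustration with invented log lines:

# Sample lines are made up; the format follows the dstack_stat log_format above.
new_line = "2025-01-01T00:00:00+00:00 svc.gateway.example 200 0.012 1"
old_line = "2025-01-01T00:00:00+00:00 svc.gateway.example 404 0.001"

for line in (new_line, old_line):
    cells = line.split()
    if len(cells) == 4:  # pre-0.19.11 logs: infer the flag from the status code
        cells.append("0" if cells[2] in ["403", "404"] else "1")
    timestamp_str, host, status, request_time, replica_hit = cells
    print(host, status, replica_hit == "1")  # the old-format 404 line is treated as a miss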

dstack/_internal/server/background/tasks/process_metrics.py
@@ -2,7 +2,7 @@ import asyncio
 import json
 from typing import Dict, List, Optional
 
-from sqlalchemy import delete, select
+from sqlalchemy import Delete, delete, select
 from sqlalchemy.orm import joinedload
 
 from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT
@@ -49,27 +49,29 @@ async def delete_metrics():
     finished_timestamp_micro_cutoff = (
         now_timestamp_micro - settings.SERVER_METRICS_FINISHED_TTL_SECONDS * 1_000_000
     )
+    await asyncio.gather(
+        _execute_delete_statement(
+            delete(JobMetricsPoint).where(
+                JobMetricsPoint.job_id.in_(
+                    select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
+                ),
+                JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
+            )
+        ),
+        _execute_delete_statement(
+            delete(JobMetricsPoint).where(
+                JobMetricsPoint.job_id.in_(
+                    select(JobModel.id).where(JobModel.status.in_(JobStatus.finished_statuses()))
+                ),
+                JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
+            )
+        ),
+    )
+
+
+async def _execute_delete_statement(stmt: Delete) -> None:
     async with get_session_ctx() as session:
-        await asyncio.gather(
-            session.execute(
-                delete(JobMetricsPoint).where(
-                    JobMetricsPoint.job_id.in_(
-                        select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
-                    ),
-                    JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
-                )
-            ),
-            session.execute(
-                delete(JobMetricsPoint).where(
-                    JobMetricsPoint.job_id.in_(
-                        select(JobModel.id).where(
-                            JobModel.status.in_(JobStatus.finished_statuses())
-                        )
-                    ),
-                    JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
-                )
-            ),
-        )
+        await session.execute(stmt)
         await session.commit()
 
 

dstack/_internal/server/background/tasks/process_submitted_jobs.py
@@ -93,11 +93,20 @@ async def _process_next_submitted_job():
     async with lock:
         res = await session.execute(
             select(JobModel)
+            .join(JobModel.run)
             .where(
                 JobModel.status == JobStatus.SUBMITTED,
                 JobModel.id.not_in(lockset),
             )
-            .order_by(JobModel.last_processed_at.asc())
+            # Jobs are process in FIFO sorted by priority globally,
+            # thus runs from different project can "overtake" each other by using higher priorities.
+            # That's not a big problem as long as projects do not compete for the same compute resources.
+            # Jobs with lower priorities from other projects will be processed without major lag
+            # as long as new higher priority runs are not constantly submitted.
+            # TODO: Consider processing jobs from different projects fairly/round-robin
+            # Fully fair processing can be tricky to implement via the current DB queue as
+            # there can be many projects and we are limited by the max DB connections.
+            .order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc())
             .limit(1)
             .with_for_update(skip_locked=True)
         )
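
In effect, the queue now orders submitted jobs by run priority first and by how long ago they were last processed second. A toy illustration of the same ordering rule with invented data:

# (name, priority, last_processed_at) tuples; higher priority first, older timestamp first within a priority.
jobs = [("job-a", 0, 3), ("job-b", 50, 7), ("job-c", 50, 2), ("job-d", 0, 1)]
ordered = sorted(jobs, key=lambda j: (-j[1], j[2]))
print([name for name, _, _ in ordered])  # ['job-c', 'job-b', 'job-d', 'job-a']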

@@ -360,16 +369,16 @@
         (instance, common_utils.get_or_error(get_instance_offer(instance)))
         for instance in nonshared_instances
     ]
-    if not multinode:
-        shared_instances_with_offers = get_shared_pool_instances_with_offers(
-            pool_instances=pool_instances,
-            profile=profile,
-            requirements=job.job_spec.requirements,
-            idle_only=True,
-            fleet_model=fleet_model,
-            volumes=volumes,
-        )
-        instances_with_offers.extend(shared_instances_with_offers)
+    shared_instances_with_offers = get_shared_pool_instances_with_offers(
+        pool_instances=pool_instances,
+        profile=profile,
+        requirements=job.job_spec.requirements,
+        idle_only=True,
+        fleet_model=fleet_model,
+        multinode=multinode,
+        volumes=volumes,
+    )
+    instances_with_offers.extend(shared_instances_with_offers)
 
     if len(instances_with_offers) == 0:
         return None
@@ -572,7 +581,7 @@
 
 
 def _prepare_job_runtime_data(offer: InstanceOfferWithAvailability) -> JobRuntimeData:
-    if offer.total_blocks == 1:
+    if offer.blocks == offer.total_blocks:
         if env_utils.get_bool("DSTACK_FORCE_BRIDGE_NETWORK"):
             network_mode = NetworkMode.BRIDGE
         else: