dstack 0.19.9__py3-none-any.whl → 0.19.11rc1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
- dstack/_internal/cli/commands/config.py +1 -1
- dstack/_internal/cli/commands/metrics.py +25 -10
- dstack/_internal/cli/commands/project.py +161 -0
- dstack/_internal/cli/commands/ps.py +9 -2
- dstack/_internal/cli/main.py +2 -0
- dstack/_internal/core/backends/azure/compute.py +8 -3
- dstack/_internal/core/backends/base/compute.py +2 -1
- dstack/_internal/core/models/configurations.py +21 -4
- dstack/_internal/core/models/runs.py +2 -1
- dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf +11 -1
- dstack/_internal/proxy/gateway/resources/nginx/service.jinja2 +12 -6
- dstack/_internal/proxy/gateway/services/stats.py +17 -3
- dstack/_internal/server/background/tasks/process_metrics.py +23 -21
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +21 -12
- dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/repos.py +8 -4
- dstack/_internal/server/services/instances.py +6 -2
- dstack/_internal/server/services/jobs/configurators/base.py +18 -4
- dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -1
- dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +3 -1
- dstack/_internal/server/services/runs.py +31 -18
- dstack/_internal/server/settings.py +1 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js → main-b4803049eac16aea9a49.js} +4 -4
- dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js.map → main-b4803049eac16aea9a49.js.map} +1 -1
- dstack/_internal/server/testing/common.py +2 -0
- dstack/_internal/server/utils/routers.py +3 -6
- dstack/_internal/settings.py +4 -0
- dstack/api/_public/runs.py +6 -3
- dstack/api/server/_runs.py +2 -0
- dstack/version.py +2 -2
- {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/METADATA +11 -6
- {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/RECORD +37 -35
- {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/WHEEL +0 -0
- {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.9.dist-info → dstack-0.19.11rc1.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/cli/commands/metrics.py
CHANGED
@@ -39,8 +39,6 @@ class MetricsCommand(APIBaseCommand):
         run = self.api.runs.get(run_name=args.run_name)
         if run is None:
             raise CLIError(f"Run {args.run_name} not found")
-        if run.status.is_finished():
-            raise CLIError(f"Run {args.run_name} is finished")
         metrics = _get_run_jobs_metrics(api=self.api, run=run)
 
         if not args.watch:
@@ -55,8 +53,6 @@ class MetricsCommand(APIBaseCommand):
                 run = self.api.runs.get(run_name=args.run_name)
                 if run is None:
                     raise CLIError(f"Run {args.run_name} not found")
-                if run.status.is_finished():
-                    raise CLIError(f"Run {args.run_name} is finished")
                 metrics = _get_run_jobs_metrics(api=self.api, run=run)
         except KeyboardInterrupt:
             pass
@@ -78,11 +74,12 @@ def _get_run_jobs_metrics(api: Client, run: Run) -> List[JobMetrics]:
 def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
     table = Table(box=None)
     table.add_column("NAME", style="bold", no_wrap=True)
+    table.add_column("STATUS")
     table.add_column("CPU")
     table.add_column("MEMORY")
     table.add_column("GPU")
 
-    run_row: Dict[Union[str, int], Any] = {"NAME": run.name}
+    run_row: Dict[Union[str, int], Any] = {"NAME": run.name, "STATUS": run.status.value}
     if len(run._run.jobs) != 1:
         add_row_from_dict(table, run_row)
 
@@ -101,9 +98,9 @@ def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
             cpu_usage = f"{cpu_usage:.0f}%"
         memory_usage = _get_metric_value(job_metrics, "memory_working_set_bytes")
         if memory_usage is not None:
-            memory_usage =
+            memory_usage = _format_memory(memory_usage, 2)
             if resources is not None:
-                memory_usage += f"/{resources.memory_mib}
+                memory_usage += f"/{_format_memory(resources.memory_mib * 1024 * 1024, 2)}"
         gpu_metrics = ""
         gpus_detected_num = _get_metric_value(job_metrics, "gpus_detected_num")
         if gpus_detected_num is not None:
@@ -113,13 +110,16 @@ def _get_metrics_table(run: Run, metrics: List[JobMetrics]) -> Table:
             if gpu_memory_usage is not None:
                 if i != 0:
                     gpu_metrics += "\n"
-                gpu_metrics += f"
+                gpu_metrics += f"gpu={i} mem={_format_memory(gpu_memory_usage, 2)}"
                 if resources is not None:
-                    gpu_metrics +=
-
+                    gpu_metrics += (
+                        f"/{_format_memory(resources.gpus[i].memory_mib * 1024 * 1024, 2)}"
+                    )
+                gpu_metrics += f" util={gpu_util_percent}%"
 
     job_row: Dict[Union[str, int], Any] = {
         "NAME": f" replica={job.job_spec.replica_num} job={job.job_spec.job_num}",
+        "STATUS": job.job_submissions[-1].status.value,
         "CPU": cpu_usage or "-",
         "MEMORY": memory_usage or "-",
         "GPU": gpu_metrics or "-",
@@ -136,3 +136,18 @@ def _get_metric_value(job_metrics: JobMetrics, name: str) -> Optional[Any]:
         if metric.name == name:
             return metric.values[-1]
     return None
+
+
+def _format_memory(memory_bytes: int, decimal_places: int) -> str:
+    """See test_format_memory in tests/_internal/cli/commands/test_metrics.py for examples."""
+    memory_mb = memory_bytes / 1024 / 1024
+    if memory_mb >= 1024:
+        value = memory_mb / 1024
+        unit = "GB"
+    else:
+        value = memory_mb
+        unit = "MB"
+
+    if decimal_places == 0:
+        return f"{round(value)}{unit}"
+    return f"{value:.{decimal_places}f}".rstrip("0").rstrip(".") + unit
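Note: the expected outputs below are derived only from the `_format_memory` implementation in the hunk above, not from the package's test suite.

# A minimal sketch of _format_memory's behavior, per the diff above.
assert _format_memory(512 * 1024 * 1024, 2) == "512MB"   # trailing zeros stripped
assert _format_memory(1536 * 1024 * 1024, 2) == "1.5GB"  # >= 1024 MB switches to GB
assert _format_memory(1536 * 1024 * 1024, 0) == "2GB"    # decimal_places=0 rounds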
dstack/_internal/cli/commands/project.py
ADDED
@@ -0,0 +1,161 @@
+import argparse
+
+from requests import HTTPError
+from rich.table import Table
+
+import dstack.api.server
+from dstack._internal.cli.commands import BaseCommand
+from dstack._internal.cli.utils.common import confirm_ask, console
+from dstack._internal.core.errors import ClientError, CLIError
+from dstack._internal.core.services.configs import ConfigManager
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class ProjectCommand(BaseCommand):
+    NAME = "project"
+    DESCRIPTION = "Manage projects configs"
+
+    def _register(self):
+        super()._register()
+        subparsers = self._parser.add_subparsers(dest="subcommand", help="Command to execute")
+
+        # Add subcommand
+        add_parser = subparsers.add_parser("add", help="Add or update a project config")
+        add_parser.add_argument(
+            "--name", type=str, help="The name of the project to configure", required=True
+        )
+        add_parser.add_argument("--url", type=str, help="Server url", required=True)
+        add_parser.add_argument("--token", type=str, help="User token", required=True)
+        add_parser.add_argument(
+            "-y",
+            "--yes",
+            help="Don't ask for confirmation (e.g. update the config)",
+            action="store_true",
+        )
+        add_parser.add_argument(
+            "-n",
+            "--no",
+            help="Don't ask for confirmation (e.g. do not update the config)",
+            action="store_true",
+        )
+        add_parser.set_defaults(subfunc=self._add)
+
+        # Delete subcommand
+        delete_parser = subparsers.add_parser("delete", help="Delete a project config")
+        delete_parser.add_argument(
+            "--name", type=str, help="The name of the project to delete", required=True
+        )
+        delete_parser.add_argument(
+            "-y",
+            "--yes",
+            help="Don't ask for confirmation",
+            action="store_true",
+        )
+        delete_parser.set_defaults(subfunc=self._delete)
+
+        # List subcommand
+        list_parser = subparsers.add_parser("list", help="List configured projects")
+        list_parser.set_defaults(subfunc=self._list)
+
+        # Set default subcommand
+        set_default_parser = subparsers.add_parser("set-default", help="Set default project")
+        set_default_parser.add_argument(
+            "name", type=str, help="The name of the project to set as default"
+        )
+        set_default_parser.set_defaults(subfunc=self._set_default)
+
+    def _command(self, args: argparse.Namespace):
+        if not hasattr(args, "subfunc"):
+            args.subfunc = self._list
+        args.subfunc(args)
+
+    def _add(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        api_client = dstack.api.server.APIClient(base_url=args.url, token=args.token)
+        try:
+            api_client.projects.get(args.name)
+        except HTTPError as e:
+            if e.response.status_code == 403:
+                raise CLIError("Forbidden. Ensure the token is valid.")
+            elif e.response.status_code == 404:
+                raise CLIError(f"Project '{args.name}' not found.")
+            else:
+                raise e
+        default_project = config_manager.get_project_config()
+        if (
+            default_project is None
+            or default_project.name != args.name
+            or default_project.url != args.url
+            or default_project.token != args.token
+        ):
+            set_it_as_default = (
+                (
+                    args.yes
+                    or not default_project
+                    or confirm_ask(f"Set '{args.name}' as your default project?")
+                )
+                if not args.no
+                else False
+            )
+            config_manager.configure_project(
+                name=args.name, url=args.url, token=args.token, default=set_it_as_default
+            )
+            config_manager.save()
+            logger.info(
+                f"Configuration updated at {config_manager.config_filepath}", {"show_path": False}
+            )
+
+    def _delete(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        if args.yes or confirm_ask(f"Are you sure you want to delete project '{args.name}'?"):
+            config_manager.delete_project(args.name)
+            config_manager.save()
+            console.print("[grey58]OK[/]")
+
+    def _list(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        default_project = config_manager.get_project_config()
+
+        table = Table(box=None)
+        table.add_column("PROJECT", style="bold", no_wrap=True)
+        table.add_column("URL", style="grey58")
+        table.add_column("USER", style="grey58")
+        table.add_column("DEFAULT", justify="center")
+
+        for project_name in config_manager.list_projects():
+            project_config = config_manager.get_project_config(project_name)
+            is_default = project_name == default_project.name if default_project else False
+
+            # Get username from API
+            try:
+                api_client = dstack.api.server.APIClient(
+                    base_url=project_config.url, token=project_config.token
+                )
+                user_info = api_client.users.get_my_user()
+                username = user_info.username
+            except ClientError:
+                username = "(invalid token)"
+
+            table.add_row(
+                project_name,
+                project_config.url,
+                username,
+                "✓" if is_default else "",
+                style="bold" if is_default else None,
+            )
+
+        console.print(table)
+
+    def _set_default(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        project_config = config_manager.get_project_config(args.name)
+        if project_config is None:
+            raise CLIError(f"Project '{args.name}' not found")
+
+        config_manager.configure_project(
+            name=args.name, url=project_config.url, token=project_config.token, default=True
+        )
+        config_manager.save()
+        console.print("[grey58]OK[/]")
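The `add` subcommand above only prompts about the default project when the stored config would actually change. A condensed sketch of that decision, extracted from `_add`; the function name `should_set_default` and the `ask` parameter are illustrative, not part of the package:

# Condensed from ProjectCommand._add above; `ask` stands in for confirm_ask().
def should_set_default(yes: bool, no: bool, has_default: bool, ask) -> bool:
    if no:  # --no never sets the project as default
        return False
    return yes or not has_default or ask()

So `dstack project add ... -y` updates the config and makes the project the default without prompting, while `-n` updates it without touching the default.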
dstack/_internal/cli/commands/ps.py
CHANGED
@@ -36,10 +36,17 @@ class PsCommand(APIBaseCommand):
             help="Watch statuses of runs in realtime",
             action="store_true",
         )
+        self._parser.add_argument(
+            "-n",
+            "--last",
+            help="Show only the last N runs. Implies --all",
+            type=int,
+            default=None,
+        )
 
     def _command(self, args: argparse.Namespace):
         super()._command(args)
-        runs = self.api.runs.list(all=args.all)
+        runs = self.api.runs.list(all=args.all, limit=args.last)
         if not args.watch:
             console.print(run_utils.get_runs_table(runs, verbose=args.verbose))
             return
@@ -49,6 +56,6 @@ class PsCommand(APIBaseCommand):
                 while True:
                     live.update(run_utils.get_runs_table(runs, verbose=args.verbose))
                     time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS)
-                    runs = self.api.runs.list(all=args.all)
+                    runs = self.api.runs.list(all=args.all, limit=args.last)
         except KeyboardInterrupt:
             pass
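The new `-n`/`--last` flag is passed to `runs.list` as `limit`. A sketch of the equivalent call through the Python API, assuming the public client's `runs.list` accepts the same `limit` parameter (the file list shows `dstack/api/_public/runs.py` changed in this release):

# Assumes runs.list() accepts limit, mirroring the CLI change above.
from dstack.api import Client

client = Client.from_config()
runs = client.runs.list(all=True, limit=5)  # roughly `dstack ps -n 5`
for run in runs:
    print(run.name, run.status)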
dstack/_internal/cli/main.py
CHANGED
@@ -15,6 +15,7 @@ from dstack._internal.cli.commands.init import InitCommand
 from dstack._internal.cli.commands.logs import LogsCommand
 from dstack._internal.cli.commands.metrics import MetricsCommand
 from dstack._internal.cli.commands.offer import OfferCommand
+from dstack._internal.cli.commands.project import ProjectCommand
 from dstack._internal.cli.commands.ps import PsCommand
 from dstack._internal.cli.commands.server import ServerCommand
 from dstack._internal.cli.commands.stats import StatsCommand
@@ -69,6 +70,7 @@ def main():
     OfferCommand.register(subparsers)
     LogsCommand.register(subparsers)
     MetricsCommand.register(subparsers)
+    ProjectCommand.register(subparsers)
     PsCommand.register(subparsers)
     ServerCommand.register(subparsers)
     StatsCommand.register(subparsers)
dstack/_internal/core/backends/azure/compute.py
CHANGED
@@ -391,9 +391,9 @@ class VMImageVariant(enum.Enum):
 
 
 _SUPPORTED_VM_SERIES_PATTERNS = [
-    r"D(\d+)
-    r"E(\d+)i?
-    r"
+    r"D(\d+)s_v6",  # Dsv6-series (general purpose)
+    r"E(\d+)i?s_v6",  # Esv6-series (memory optimized)
+    r"F(\d+)s_v2",  # Fsv2-series (compute optimized)
     r"NC(\d+)s_v3",  # NCv3-series [V100 16GB]
     r"NC(\d+)as_T4_v3",  # NCasT4_v3-series [T4]
     r"ND(\d+)rs_v2",  # NDv2-series [8xV100 32GB]
@@ -401,6 +401,11 @@ _SUPPORTED_VM_SERIES_PATTERNS = [
     r"NC(\d+)ads_A100_v4",  # NC A100 v4-series [A100 80GB]
     r"ND(\d+)asr_v4",  # ND A100 v4-series [8xA100 40GB]
     r"ND(\d+)amsr_A100_v4",  # NDm A100 v4-series [8xA100 80GB]
+    # Deprecated series
+    # TODO: Remove after several releases
+    r"D(\d+)s_v3",  # Dsv3-series (general purpose)
+    r"E(\d+)i?s_v4",  # Esv4-series (memory optimized)
+    r"E(\d+)-(\d+)s_v4",  # Esv4-series (constrained vCPU)
 ]
 _SUPPORTED_VM_SERIES_PATTERN = (
     "^Standard_(" + "|".join(f"({s})" for s in _SUPPORTED_VM_SERIES_PATTERNS) + ")$"
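A quick sanity check of the updated pattern, using only series that appear in the diff above:

# Reconstructed subset of _SUPPORTED_VM_SERIES_PATTERNS: one new v6 series,
# one compute-optimized series, and one deprecated v3 series kept for now.
import re

patterns = [r"D(\d+)s_v6", r"F(\d+)s_v2", r"D(\d+)s_v3"]
pattern = "^Standard_(" + "|".join(f"({s})" for s in patterns) + ")$"

assert re.match(pattern, "Standard_D4s_v6")      # new general-purpose series
assert re.match(pattern, "Standard_F16s_v2")     # compute optimized
assert re.match(pattern, "Standard_D8s_v3")      # deprecated but still accepted
assert not re.match(pattern, "Standard_D8s_v5")  # never in the list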
dstack/_internal/core/backends/base/compute.py
CHANGED
@@ -19,6 +19,7 @@ from dstack._internal.core.consts import (
     DSTACK_RUNNER_SSH_PORT,
     DSTACK_SHIM_HTTP_PORT,
 )
+from dstack._internal.core.models.configurations import DEFAULT_REPO_DIR
 from dstack._internal.core.models.gateways import (
     GatewayComputeConfiguration,
     GatewayProvisioningData,
@@ -754,7 +755,7 @@ def get_docker_commands(
             f" --ssh-port {DSTACK_RUNNER_SSH_PORT}"
             " --temp-dir /tmp/runner"
             " --home-dir /root"
-            " --working-dir
+            f" --working-dir {DEFAULT_REPO_DIR}"
         ),
     ]
 
dstack/_internal/core/models/configurations.py
CHANGED
@@ -23,6 +23,10 @@ ValidPort = conint(gt=0, le=65536)
 MAX_INT64 = 2**63 - 1
 SERVICE_HTTPS_DEFAULT = True
 STRIP_PREFIX_DEFAULT = True
+RUN_PRIOTIRY_MIN = 0
+RUN_PRIOTIRY_MAX = 100
+RUN_PRIORITY_DEFAULT = 0
+DEFAULT_REPO_DIR = "/workflow"
 
 
 class RunConfigurationType(str, Enum):
@@ -77,7 +81,8 @@ class ScalingSpec(CoreModel):
         Field(
             description="The target value of the metric. "
             "The number of replicas is calculated based on this number and automatically adjusts "
-            "(scales up or down) as this metric changes"
+            "(scales up or down) as this metric changes",
+            gt=0,
         ),
     ]
     scale_up_delay: Annotated[
@@ -177,7 +182,7 @@ class BaseRunConfiguration(CoreModel):
         Field(
             description=(
                 "The path to the working directory inside the container."
-                " It's specified relative to the repository directory (
+                f" It's specified relative to the repository directory (`{DEFAULT_REPO_DIR}`) and should be inside it."
                 ' Defaults to `"."` '
             )
         ),
@@ -221,14 +226,26 @@ class BaseRunConfiguration(CoreModel):
             )
         ),
     ] = None
-    # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
-    setup: CommandsList = []
     resources: Annotated[
         ResourcesSpec, Field(description="The resources requirements to run the configuration")
     ] = ResourcesSpec()
+    priority: Annotated[
+        Optional[int],
+        Field(
+            ge=RUN_PRIOTIRY_MIN,
+            le=RUN_PRIOTIRY_MAX,
+            description=(
+                f"The priority of the run, an integer between `{RUN_PRIOTIRY_MIN}` and `{RUN_PRIOTIRY_MAX}`."
+                " `dstack` tries to provision runs with higher priority first."
+                f" Defaults to `{RUN_PRIORITY_DEFAULT}`"
+            ),
+        ),
+    ] = None
     volumes: Annotated[
         List[Union[MountPoint, str]], Field(description="The volumes mount points")
     ] = []
+    # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
+    setup: CommandsList = []
 
     @validator("python", pre=True, always=True)
     def convert_python(cls, v, values) -> Optional[PythonVersion]:
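The bounds on the new `priority` field are enforced by pydantic's `ge`/`le` constraints. A minimal standalone sketch of the same check, using a plain `BaseModel` instead of dstack's `CoreModel` and with the constants inlined:

# RUN_PRIOTIRY_MIN = 0 and RUN_PRIOTIRY_MAX = 100 per the constants added above.
from typing import Optional
from pydantic import BaseModel, Field, ValidationError

class Cfg(BaseModel):
    priority: Optional[int] = Field(None, ge=0, le=100)

Cfg(priority=50)       # accepted
try:
    Cfg(priority=101)  # rejected: above the le=100 bound
except ValidationError as e:
    print(e)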
dstack/_internal/core/models/runs.py
CHANGED
@@ -8,6 +8,7 @@ from typing_extensions import Annotated
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import ApplyAction, CoreModel, NetworkMode, RegistryAuth
 from dstack._internal.core.models.configurations import (
+    DEFAULT_REPO_DIR,
     AnyRunConfiguration,
     RunConfiguration,
 )
@@ -338,7 +339,7 @@ class RunSpec(CoreModel):
         Field(
             description=(
                 "The path to the working directory inside the container."
-                " It's specified relative to the repository directory (
+                f" It's specified relative to the repository directory (`{DEFAULT_REPO_DIR}`) and should be inside it."
                 ' Defaults to `"."`.'
             )
         ),
dstack/_internal/proxy/gateway/resources/nginx/00-log-format.conf
CHANGED
@@ -1 +1,11 @@
-log_format dstack_stat '$time_iso8601 $host $status $request_time';
+log_format dstack_stat '$time_iso8601 $host $status $request_time $dstack_replica_hit';
+
+
+# A hack to avoid this Nginx reload error when no services are registered:
+# nginx: [emerg] unknown "dstack_replica_hit" variable
+server {
+    listen unix:/tmp/dstack-dummy-nginx.sock;
+    server_name placeholder.local;
+    deny all;
+    set $dstack_replica_hit 0;
+}
dstack/_internal/proxy/gateway/resources/nginx/service.jinja2
CHANGED
@@ -14,6 +14,7 @@ upstream {{ domain }}.upstream {
 server {
     server_name {{ domain }};
     limit_req_status 429;
+    set $dstack_replica_hit 0;
     access_log {{ access_log_path }} dstack_stat;
     client_max_body_size {{ client_max_body_size }};
 
@@ -23,11 +24,7 @@ server {
         auth_request /_dstack_auth;
         {% endif %}
 
-        {% if replicas %}
         try_files /nonexistent @$http_upgrade;
-        {% else %}
-        return 503;
-        {% endif %}
 
         {% if location.limit_req %}
         limit_req zone={{ location.limit_req.zone }}{% if location.limit_req.burst %} burst={{ location.limit_req.burst }} nodelay{% endif %};
@@ -35,8 +32,9 @@ server {
     }
     {% endfor %}
 
-    {% if replicas %}
     location @websocket {
+        set $dstack_replica_hit 1;
+        {% if replicas %}
         proxy_pass http://{{ domain }}.upstream;
         proxy_set_header X-Real-IP $remote_addr;
         proxy_set_header Host $host;
@@ -44,19 +42,27 @@ server {
         proxy_set_header Upgrade $http_upgrade;
         proxy_set_header Connection "Upgrade";
         proxy_read_timeout 300s;
+        {% else %}
+        return 503;
+        {% endif %}
     }
     location @ {
+        set $dstack_replica_hit 1;
+        {% if replicas %}
         proxy_pass http://{{ domain }}.upstream;
         proxy_set_header X-Real-IP $remote_addr;
         proxy_set_header Host $host;
         proxy_read_timeout 300s;
+        {% else %}
+        return 503;
+        {% endif %}
     }
-    {% endif %}
 
     {% if auth %}
     location = /_dstack_auth {
         internal;
         if ($remote_addr = 127.0.0.1) {
+            # for requests from the gateway app, e.g. from the OpenAI-compatible API
            return 200;
         }
         proxy_pass http://localhost:{{ proxy_port }}/api/auth/{{ project_name }};
dstack/_internal/proxy/gateway/services/stats.py
CHANGED
@@ -11,10 +11,10 @@ from pydantic import BaseModel
 
 from dstack._internal.proxy.gateway.repo.repo import GatewayProxyRepo
 from dstack._internal.proxy.gateway.schemas.stats import PerWindowStats, ServiceStats, Stat
+from dstack._internal.proxy.lib.errors import UnexpectedProxyError
 from dstack._internal.utils.common import run_async
 
 logger = logging.getLogger(__name__)
-IGNORE_STATUSES = {403, 404}
 WINDOWS = (30, 60, 300)
 TTL = WINDOWS[-1]
 EMPTY_STATS = {window: Stat(requests=0, request_time=0.0) for window in WINDOWS}
@@ -35,6 +35,7 @@ class LogEntry(BaseModel):
     host: str
     status: int
     request_time: float
+    is_replica_hit: bool
 
 
 class StatsCollector:
@@ -87,7 +88,8 @@ class StatsCollector:
         now = datetime.datetime.now(tz=datetime.timezone.utc)
 
         for entry in self._read_access_log(now - datetime.timedelta(seconds=TTL)):
-
+            # only include requests that hit or should hit a service replica
+            if not entry.is_replica_hit:
                 continue
 
             frame_timestamp = int(entry.timestamp.timestamp())
@@ -119,7 +121,10 @@ class StatsCollector:
             line = self._file.readline()
             if not line:
                 break
-
+            cells = line.split()
+            if len(cells) == 4:  # compatibility with pre-0.19.11 logs
+                cells.append("0" if cells[2] in ["403", "404"] else "1")
+            timestamp_str, host, status, request_time, dstack_replica_hit = cells
             timestamp = datetime.datetime.fromisoformat(timestamp_str)
             if timestamp < after:
                 continue
@@ -128,6 +133,7 @@ class StatsCollector:
                 host=host,
                 status=int(status),
                 request_time=float(request_time),
+                is_replica_hit=_parse_nginx_bool(dstack_replica_hit),
             )
             if os.fstat(self._file.fileno()).st_ino != st_ino:
                 # file was rotated
@@ -154,3 +160,11 @@ async def get_service_stats(
         )
         for service in services
     ]
+
+
+def _parse_nginx_bool(v: str) -> bool:
+    if v == "0":
+        return False
+    if v == "1":
+        return True
+    raise UnexpectedProxyError(f"Cannot parse boolean value: expected '0' or '1', got {v!r}")
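The backward-compatibility branch in `_read_access_log` can be exercised on its own; the log lines below are made up for illustration:

# 4-cell lines predate 0.19.11. Since 403/404 responses were previously
# dropped via IGNORE_STATUSES, they are backfilled as replica_hit = "0".
old_line = "2025-01-01T12:00:00+00:00 svc.example.com 404 0.002"
new_line = "2025-01-01T12:00:01+00:00 svc.example.com 200 0.130 1"

for line in (old_line, new_line):
    cells = line.split()
    if len(cells) == 4:  # compatibility with pre-0.19.11 logs
        cells.append("0" if cells[2] in ["403", "404"] else "1")
    timestamp_str, host, status, request_time, replica_hit = cells
    print(host, status, replica_hit)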
dstack/_internal/server/background/tasks/process_metrics.py
CHANGED
@@ -2,7 +2,7 @@ import asyncio
 import json
 from typing import Dict, List, Optional
 
-from sqlalchemy import delete, select
+from sqlalchemy import Delete, delete, select
 from sqlalchemy.orm import joinedload
 
 from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT
@@ -49,27 +49,29 @@ async def delete_metrics():
     finished_timestamp_micro_cutoff = (
         now_timestamp_micro - settings.SERVER_METRICS_FINISHED_TTL_SECONDS * 1_000_000
     )
+    await asyncio.gather(
+        _execute_delete_statement(
+            delete(JobMetricsPoint).where(
+                JobMetricsPoint.job_id.in_(
+                    select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
+                ),
+                JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
+            )
+        ),
+        _execute_delete_statement(
+            delete(JobMetricsPoint).where(
+                JobMetricsPoint.job_id.in_(
+                    select(JobModel.id).where(JobModel.status.in_(JobStatus.finished_statuses()))
+                ),
+                JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
+            )
+        ),
+    )
+
+
+async def _execute_delete_statement(stmt: Delete) -> None:
     async with get_session_ctx() as session:
-        await
-        session.execute(
-            delete(JobMetricsPoint).where(
-                JobMetricsPoint.job_id.in_(
-                    select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
-                ),
-                JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
-            )
-        ),
-        session.execute(
-            delete(JobMetricsPoint).where(
-                JobMetricsPoint.job_id.in_(
-                    select(JobModel.id).where(
-                        JobModel.status.in_(JobStatus.finished_statuses())
-                    )
-                ),
-                JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
-            )
-        ),
-        )
+        await session.execute(stmt)
         await session.commit()
 
 
dstack/_internal/server/background/tasks/process_submitted_jobs.py
CHANGED
@@ -93,11 +93,20 @@ async def _process_next_submitted_job():
     async with lock:
         res = await session.execute(
             select(JobModel)
+            .join(JobModel.run)
             .where(
                 JobModel.status == JobStatus.SUBMITTED,
                 JobModel.id.not_in(lockset),
             )
-
+            # Jobs are process in FIFO sorted by priority globally,
+            # thus runs from different project can "overtake" each other by using higher priorities.
+            # That's not a big problem as long as projects do not compete for the same compute resources.
+            # Jobs with lower priorities from other projects will be processed without major lag
+            # as long as new higher priority runs are not constantly submitted.
+            # TODO: Consider processing jobs from different projects fairly/round-robin
+            # Fully fair processing can be tricky to implement via the current DB queue as
+            # there can be many projects and we are limited by the max DB connections.
+            .order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc())
             .limit(1)
             .with_for_update(skip_locked=True)
         )
@@ -360,16 +369,16 @@ async def _assign_job_to_pool_instance(
         (instance, common_utils.get_or_error(get_instance_offer(instance)))
         for instance in nonshared_instances
     ]
-
-
-
-
-
-
-
-
-
-
+    shared_instances_with_offers = get_shared_pool_instances_with_offers(
+        pool_instances=pool_instances,
+        profile=profile,
+        requirements=job.job_spec.requirements,
+        idle_only=True,
+        fleet_model=fleet_model,
+        multinode=multinode,
+        volumes=volumes,
+    )
+    instances_with_offers.extend(shared_instances_with_offers)
 
     if len(instances_with_offers) == 0:
         return None
@@ -572,7 +581,7 @@ def _create_instance_model_for_job(
 
 
 def _prepare_job_runtime_data(offer: InstanceOfferWithAvailability) -> JobRuntimeData:
-    if offer.
+    if offer.blocks == offer.total_blocks:
         if env_utils.get_bool("DSTACK_FORCE_BRIDGE_NETWORK"):
             network_mode = NetworkMode.BRIDGE
         else: