dstack 0.19.9__py3-none-any.whl → 0.19.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/config.py +1 -1
- dstack/_internal/cli/commands/project.py +161 -0
- dstack/_internal/cli/commands/ps.py +9 -2
- dstack/_internal/cli/main.py +2 -0
- dstack/_internal/core/backends/azure/compute.py +5 -2
- dstack/_internal/core/models/configurations.py +19 -3
- dstack/_internal/server/background/tasks/process_metrics.py +23 -21
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +21 -12
- dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/repos.py +8 -4
- dstack/_internal/server/services/instances.py +6 -2
- dstack/_internal/server/services/jobs/configurators/base.py +3 -3
- dstack/_internal/server/services/runs.py +31 -18
- dstack/_internal/server/settings.py +1 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js → main-b4803049eac16aea9a49.js} +4 -4
- dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js.map → main-b4803049eac16aea9a49.js.map} +1 -1
- dstack/_internal/server/testing/common.py +2 -0
- dstack/_internal/server/utils/routers.py +3 -6
- dstack/_internal/settings.py +4 -0
- dstack/api/_public/runs.py +6 -3
- dstack/api/server/_runs.py +2 -0
- dstack/version.py +1 -1
- {dstack-0.19.9.dist-info → dstack-0.19.10.dist-info}/METADATA +11 -6
- {dstack-0.19.9.dist-info → dstack-0.19.10.dist-info}/RECORD +29 -27
- {dstack-0.19.9.dist-info → dstack-0.19.10.dist-info}/WHEEL +0 -0
- {dstack-0.19.9.dist-info → dstack-0.19.10.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.9.dist-info → dstack-0.19.10.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/cli/commands/project.py
ADDED
@@ -0,0 +1,161 @@
+import argparse
+
+from requests import HTTPError
+from rich.table import Table
+
+import dstack.api.server
+from dstack._internal.cli.commands import BaseCommand
+from dstack._internal.cli.utils.common import confirm_ask, console
+from dstack._internal.core.errors import ClientError, CLIError
+from dstack._internal.core.services.configs import ConfigManager
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class ProjectCommand(BaseCommand):
+    NAME = "project"
+    DESCRIPTION = "Manage projects configs"
+
+    def _register(self):
+        super()._register()
+        subparsers = self._parser.add_subparsers(dest="subcommand", help="Command to execute")
+
+        # Add subcommand
+        add_parser = subparsers.add_parser("add", help="Add or update a project config")
+        add_parser.add_argument(
+            "--name", type=str, help="The name of the project to configure", required=True
+        )
+        add_parser.add_argument("--url", type=str, help="Server url", required=True)
+        add_parser.add_argument("--token", type=str, help="User token", required=True)
+        add_parser.add_argument(
+            "-y",
+            "--yes",
+            help="Don't ask for confirmation (e.g. update the config)",
+            action="store_true",
+        )
+        add_parser.add_argument(
+            "-n",
+            "--no",
+            help="Don't ask for confirmation (e.g. do not update the config)",
+            action="store_true",
+        )
+        add_parser.set_defaults(subfunc=self._add)
+
+        # Delete subcommand
+        delete_parser = subparsers.add_parser("delete", help="Delete a project config")
+        delete_parser.add_argument(
+            "--name", type=str, help="The name of the project to delete", required=True
+        )
+        delete_parser.add_argument(
+            "-y",
+            "--yes",
+            help="Don't ask for confirmation",
+            action="store_true",
+        )
+        delete_parser.set_defaults(subfunc=self._delete)
+
+        # List subcommand
+        list_parser = subparsers.add_parser("list", help="List configured projects")
+        list_parser.set_defaults(subfunc=self._list)
+
+        # Set default subcommand
+        set_default_parser = subparsers.add_parser("set-default", help="Set default project")
+        set_default_parser.add_argument(
+            "name", type=str, help="The name of the project to set as default"
+        )
+        set_default_parser.set_defaults(subfunc=self._set_default)
+
+    def _command(self, args: argparse.Namespace):
+        if not hasattr(args, "subfunc"):
+            args.subfunc = self._list
+        args.subfunc(args)
+
+    def _add(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        api_client = dstack.api.server.APIClient(base_url=args.url, token=args.token)
+        try:
+            api_client.projects.get(args.name)
+        except HTTPError as e:
+            if e.response.status_code == 403:
+                raise CLIError("Forbidden. Ensure the token is valid.")
+            elif e.response.status_code == 404:
+                raise CLIError(f"Project '{args.name}' not found.")
+            else:
+                raise e
+        default_project = config_manager.get_project_config()
+        if (
+            default_project is None
+            or default_project.name != args.name
+            or default_project.url != args.url
+            or default_project.token != args.token
+        ):
+            set_it_as_default = (
+                (
+                    args.yes
+                    or not default_project
+                    or confirm_ask(f"Set '{args.name}' as your default project?")
+                )
+                if not args.no
+                else False
+            )
+            config_manager.configure_project(
+                name=args.name, url=args.url, token=args.token, default=set_it_as_default
+            )
+            config_manager.save()
+            logger.info(
+                f"Configuration updated at {config_manager.config_filepath}", {"show_path": False}
+            )
+
+    def _delete(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        if args.yes or confirm_ask(f"Are you sure you want to delete project '{args.name}'?"):
+            config_manager.delete_project(args.name)
+            config_manager.save()
+            console.print("[grey58]OK[/]")
+
+    def _list(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        default_project = config_manager.get_project_config()
+
+        table = Table(box=None)
+        table.add_column("PROJECT", style="bold", no_wrap=True)
+        table.add_column("URL", style="grey58")
+        table.add_column("USER", style="grey58")
+        table.add_column("DEFAULT", justify="center")
+
+        for project_name in config_manager.list_projects():
+            project_config = config_manager.get_project_config(project_name)
+            is_default = project_name == default_project.name if default_project else False
+
+            # Get username from API
+            try:
+                api_client = dstack.api.server.APIClient(
+                    base_url=project_config.url, token=project_config.token
+                )
+                user_info = api_client.users.get_my_user()
+                username = user_info.username
+            except ClientError:
+                username = "(invalid token)"
+
+            table.add_row(
+                project_name,
+                project_config.url,
+                username,
+                "✓" if is_default else "",
+                style="bold" if is_default else None,
+            )
+
+        console.print(table)
+
+    def _set_default(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        project_config = config_manager.get_project_config(args.name)
+        if project_config is None:
+            raise CLIError(f"Project '{args.name}' not found")
+
+        config_manager.configure_project(
+            name=args.name, url=project_config.url, token=project_config.token, default=True
+        )
+        config_manager.save()
+        console.print("[grey58]OK[/]")
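For reference, a typical session with the new command, based on the arguments registered in `_register` above (the project name, server URL, and token below are placeholders):

    dstack project add --name my-project --url https://my-dstack-server.example --token $DSTACK_TOKEN
    dstack project list
    dstack project set-default my-project
    dstack project delete --name my-project --yes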
dstack/_internal/cli/commands/ps.py
CHANGED
@@ -36,10 +36,17 @@ class PsCommand(APIBaseCommand):
             help="Watch statuses of runs in realtime",
             action="store_true",
         )
+        self._parser.add_argument(
+            "-n",
+            "--last",
+            help="Show only the last N runs. Implies --all",
+            type=int,
+            default=None,
+        )

     def _command(self, args: argparse.Namespace):
         super()._command(args)
-        runs = self.api.runs.list(all=args.all)
+        runs = self.api.runs.list(all=args.all, limit=args.last)
         if not args.watch:
             console.print(run_utils.get_runs_table(runs, verbose=args.verbose))
             return
@@ -49,6 +56,6 @@ class PsCommand(APIBaseCommand):
             while True:
                 live.update(run_utils.get_runs_table(runs, verbose=args.verbose))
                 time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS)
-                runs = self.api.runs.list(all=args.all)
+                runs = self.api.runs.list(all=args.all, limit=args.last)
         except KeyboardInterrupt:
             pass
dstack/_internal/cli/main.py
CHANGED
@@ -15,6 +15,7 @@ from dstack._internal.cli.commands.init import InitCommand
 from dstack._internal.cli.commands.logs import LogsCommand
 from dstack._internal.cli.commands.metrics import MetricsCommand
 from dstack._internal.cli.commands.offer import OfferCommand
+from dstack._internal.cli.commands.project import ProjectCommand
 from dstack._internal.cli.commands.ps import PsCommand
 from dstack._internal.cli.commands.server import ServerCommand
 from dstack._internal.cli.commands.stats import StatsCommand
@@ -69,6 +70,7 @@ def main():
     OfferCommand.register(subparsers)
     LogsCommand.register(subparsers)
     MetricsCommand.register(subparsers)
+    ProjectCommand.register(subparsers)
     PsCommand.register(subparsers)
     ServerCommand.register(subparsers)
     StatsCommand.register(subparsers)
dstack/_internal/core/backends/azure/compute.py
CHANGED
@@ -391,9 +391,12 @@ class VMImageVariant(enum.Enum):


 _SUPPORTED_VM_SERIES_PATTERNS = [
-
-
+    # TODO: Support newer CPU series (Dsv6, Esv6).
+    # They are NVMe-only and require marking the VM image as NVMe.
+    r"D(\d+)s_v3",  # Dsv3-series (general purpose)
+    r"E(\d+)i?s_v4",  # Esv4-series (memory optimized)
     r"E(\d+)-(\d+)s_v4",  # Esv4-series (constrained vCPU)
+    r"F(\d+)s_v2",  # Fsv2-series (compute optimized)
     r"NC(\d+)s_v3",  # NCv3-series [V100 16GB]
     r"NC(\d+)as_T4_v3",  # NCasT4_v3-series [T4]
     r"ND(\d+)rs_v2",  # NDv2-series [8xV100 32GB]
dstack/_internal/core/models/configurations.py
CHANGED
@@ -23,6 +23,9 @@ ValidPort = conint(gt=0, le=65536)
 MAX_INT64 = 2**63 - 1
 SERVICE_HTTPS_DEFAULT = True
 STRIP_PREFIX_DEFAULT = True
+RUN_PRIOTIRY_MIN = 0
+RUN_PRIOTIRY_MAX = 100
+RUN_PRIORITY_DEFAULT = 0


 class RunConfigurationType(str, Enum):
@@ -77,7 +80,8 @@ class ScalingSpec(CoreModel):
         Field(
             description="The target value of the metric. "
             "The number of replicas is calculated based on this number and automatically adjusts "
-            "(scales up or down) as this metric changes"
+            "(scales up or down) as this metric changes",
+            gt=0,
         ),
     ]
     scale_up_delay: Annotated[
@@ -221,14 +225,26 @@ class BaseRunConfiguration(CoreModel):
             )
         ),
     ] = None
-    # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
-    setup: CommandsList = []
     resources: Annotated[
         ResourcesSpec, Field(description="The resources requirements to run the configuration")
     ] = ResourcesSpec()
+    priority: Annotated[
+        Optional[int],
+        Field(
+            ge=RUN_PRIOTIRY_MIN,
+            le=RUN_PRIOTIRY_MAX,
+            description=(
+                f"The priority of the run, an integer between `{RUN_PRIOTIRY_MIN}` and `{RUN_PRIOTIRY_MAX}`."
+                " `dstack` tries to provision runs with higher priority first."
+                f" Defaults to `{RUN_PRIORITY_DEFAULT}`"
+            ),
+        ),
+    ] = None
     volumes: Annotated[
         List[Union[MountPoint, str]], Field(description="The volumes mount points")
     ] = []
+    # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
+    setup: CommandsList = []

     @validator("python", pre=True, always=True)
     def convert_python(cls, v, values) -> Optional[PythonVersion]:
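The new `priority` field relies on pydantic's `ge`/`le` constraints, so out-of-range values are rejected at validation time. A minimal standalone sketch of that behavior (a toy model mirroring the bounds above, not dstack's actual `BaseRunConfiguration`):

    from typing import Optional
    from pydantic import BaseModel, Field, ValidationError

    class Conf(BaseModel):
        # Same bounds as RUN_PRIOTIRY_MIN/RUN_PRIOTIRY_MAX above.
        priority: Optional[int] = Field(None, ge=0, le=100)

    Conf(priority=50)   # accepted
    Conf()              # accepted; the server later substitutes RUN_PRIORITY_DEFAULT
    try:
        Conf(priority=101)
    except ValidationError:
        print("priority above le=100 is rejected")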
dstack/_internal/server/background/tasks/process_metrics.py
CHANGED
@@ -2,7 +2,7 @@ import asyncio
 import json
 from typing import Dict, List, Optional

-from sqlalchemy import delete, select
+from sqlalchemy import Delete, delete, select
 from sqlalchemy.orm import joinedload

 from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT
@@ -49,27 +49,29 @@ async def delete_metrics():
     finished_timestamp_micro_cutoff = (
         now_timestamp_micro - settings.SERVER_METRICS_FINISHED_TTL_SECONDS * 1_000_000
     )
+    await asyncio.gather(
+        _execute_delete_statement(
+            delete(JobMetricsPoint).where(
+                JobMetricsPoint.job_id.in_(
+                    select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
+                ),
+                JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
+            )
+        ),
+        _execute_delete_statement(
+            delete(JobMetricsPoint).where(
+                JobMetricsPoint.job_id.in_(
+                    select(JobModel.id).where(JobModel.status.in_(JobStatus.finished_statuses()))
+                ),
+                JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
+            )
+        ),
+    )
+
+
+async def _execute_delete_statement(stmt: Delete) -> None:
     async with get_session_ctx() as session:
-        await
-            session.execute(
-                delete(JobMetricsPoint).where(
-                    JobMetricsPoint.job_id.in_(
-                        select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
-                    ),
-                    JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
-                )
-            ),
-            session.execute(
-                delete(JobMetricsPoint).where(
-                    JobMetricsPoint.job_id.in_(
-                        select(JobModel.id).where(
-                            JobModel.status.in_(JobStatus.finished_statuses())
-                        )
-                    ),
-                    JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
-                )
-            ),
-        )
+        await session.execute(stmt)
         await session.commit()
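The point of this refactor appears to be that each DELETE now runs through the extracted `_execute_delete_statement` helper in its own session (and thus its own database connection), so `asyncio.gather` can execute the two cleanups concurrently instead of awaiting them sequentially within a single session.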
dstack/_internal/server/background/tasks/process_submitted_jobs.py
CHANGED
@@ -93,11 +93,20 @@ async def _process_next_submitted_job():
     async with lock:
         res = await session.execute(
             select(JobModel)
+            .join(JobModel.run)
             .where(
                 JobModel.status == JobStatus.SUBMITTED,
                 JobModel.id.not_in(lockset),
             )
-
+            # Jobs are processed in FIFO order sorted by priority globally,
+            # thus runs from different projects can "overtake" each other by using higher priorities.
+            # That's not a big problem as long as projects do not compete for the same compute resources.
+            # Jobs with lower priorities from other projects will be processed without major lag
+            # as long as new higher priority runs are not constantly submitted.
+            # TODO: Consider processing jobs from different projects fairly/round-robin.
+            # Fully fair processing can be tricky to implement via the current DB queue as
+            # there can be many projects and we are limited by the max DB connections.
+            .order_by(RunModel.priority.desc(), JobModel.last_processed_at.asc())
             .limit(1)
             .with_for_update(skip_locked=True)
         )
@@ -360,16 +369,16 @@ async def _assign_job_to_pool_instance(
         (instance, common_utils.get_or_error(get_instance_offer(instance)))
         for instance in nonshared_instances
     ]
-
-
-
-
-
-
-
-
-
-
+    shared_instances_with_offers = get_shared_pool_instances_with_offers(
+        pool_instances=pool_instances,
+        profile=profile,
+        requirements=job.job_spec.requirements,
+        idle_only=True,
+        fleet_model=fleet_model,
+        multinode=multinode,
+        volumes=volumes,
+    )
+    instances_with_offers.extend(shared_instances_with_offers)

     if len(instances_with_offers) == 0:
         return None
@@ -572,7 +581,7 @@ def _create_instance_model_for_job(


 def _prepare_job_runtime_data(offer: InstanceOfferWithAvailability) -> JobRuntimeData:
-    if offer.
+    if offer.blocks == offer.total_blocks:
         if env_utils.get_bool("DSTACK_FORCE_BRIDGE_NETWORK"):
             network_mode = NetworkMode.BRIDGE
         else:
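A toy illustration of the new queue ordering (`ORDER BY priority DESC, last_processed_at ASC`), not server code: within the same priority the queue stays FIFO, while higher-priority runs jump ahead:

    jobs = [
        {"name": "a", "priority": 0, "last_processed_at": 1},
        {"name": "b", "priority": 50, "last_processed_at": 3},
        {"name": "c", "priority": 50, "last_processed_at": 2},
    ]
    # Sort descending by priority, then ascending by submission order.
    queue = sorted(jobs, key=lambda j: (-j["priority"], j["last_processed_at"]))
    print([j["name"] for j in queue])  # ['c', 'b', 'a']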
dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py
ADDED
@@ -0,0 +1,34 @@
+"""Add RunModel.priority
+
+Revision ID: bca2fdf130bf
+Revises: 20166748b60c
+Create Date: 2025-05-14 15:24:21.269775
+
+"""
+
+import sqlalchemy as sa
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "bca2fdf130bf"
+down_revision = "20166748b60c"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.add_column(sa.Column("priority", sa.Integer(), nullable=True))
+        batch_op.execute("UPDATE runs SET priority = 0")
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.alter_column("priority", nullable=False)
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table("runs", schema=None) as batch_op:
+        batch_op.drop_column("priority")
+
+    # ### end Alembic commands ###
dstack/_internal/server/models.py
CHANGED
@@ -348,6 +348,7 @@ class RunModel(BaseModel):
     resubmission_attempt: Mapped[int] = mapped_column(Integer, default=0)
     run_spec: Mapped[str] = mapped_column(Text)
     service_spec: Mapped[Optional[str]] = mapped_column(Text)
+    priority: Mapped[int] = mapped_column(Integer, default=0)

     jobs: Mapped[List["JobModel"]] = relationship(
         back_populates="run", lazy="selectin", order_by="[JobModel.replica_num, JobModel.job_num]"
dstack/_internal/server/routers/repos.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import List, Tuple

 from fastapi import APIRouter, Depends, Request, UploadFile
+from humanize import naturalsize
 from sqlalchemy.ext.asyncio import AsyncSession

 from dstack._internal.core.errors import ResourceNotExistsError, ServerClientError
@@ -14,9 +15,10 @@ from dstack._internal.server.schemas.repos import (
 )
 from dstack._internal.server.security.permissions import ProjectMember
 from dstack._internal.server.services import repos
+from dstack._internal.server.settings import SERVER_CODE_UPLOAD_LIMIT
 from dstack._internal.server.utils.routers import (
     get_base_api_additional_responses,
-
+    get_request_size,
 )

 router = APIRouter(
@@ -94,10 +96,12 @@ async def upload_code(
     session: AsyncSession = Depends(get_session),
     user_project: Tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
 ):
-
+    request_size = get_request_size(request)
+    if SERVER_CODE_UPLOAD_LIMIT > 0 and request_size > SERVER_CODE_UPLOAD_LIMIT:
         raise ServerClientError(
-            "Repo diff size exceeds the limit of
-            "Use .gitignore to exclude large files from the repo."
+            f"Repo diff size is {naturalsize(request_size)}, which exceeds the limit of "
+            f"{naturalsize(SERVER_CODE_UPLOAD_LIMIT)}. Use .gitignore to exclude large files from the repo. This "
+            f"limit can be modified by setting the DSTACK_SERVER_CODE_UPLOAD_LIMIT_BYTES environment variable"
        )
     _, project = user_project
     await repos.upload_code(
dstack/_internal/server/services/instances.py
CHANGED
@@ -235,6 +235,7 @@ def get_shared_pool_instances_with_offers(
     *,
     idle_only: bool = False,
     fleet_model: Optional[FleetModel] = None,
+    multinode: bool = False,
     volumes: Optional[List[List[Volume]]] = None,
 ) -> list[tuple[InstanceModel, InstanceOfferWithAvailability]]:
     instances_with_offers: list[tuple[InstanceModel, InstanceOfferWithAvailability]] = []
@@ -243,19 +244,22 @@ def get_shared_pool_instances_with_offers(
         pool_instances=pool_instances,
         profile=profile,
         fleet_model=fleet_model,
-        multinode=
+        multinode=multinode,
         volumes=volumes,
         shared=True,
     )
     for instance in filtered_instances:
         if idle_only and instance.status not in [InstanceStatus.IDLE, InstanceStatus.BUSY]:
             continue
+        if multinode and instance.busy_blocks > 0:
+            continue
         offer = get_instance_offer(instance)
         if offer is None:
             continue
         total_blocks = common_utils.get_or_error(instance.total_blocks)
         idle_blocks = total_blocks - instance.busy_blocks
-
+        min_blocks = total_blocks if multinode else 1
+        for blocks in range(min_blocks, total_blocks + 1):
             shared_offer = generate_shared_offer(offer, blocks, total_blocks)
             catalog_item = offer_to_catalog_item(shared_offer)
             if gpuhunt.matches(catalog_item, query_filter):
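The block-enumeration change above can be summarized with a small sketch (a toy function, not dstack code): a multinode job may only take a fully idle instance as a whole, while other jobs can take any number of blocks:

    def candidate_block_counts(total_blocks: int, multinode: bool) -> list[int]:
        # Multinode runs require the whole instance; others may take a subdivision.
        min_blocks = total_blocks if multinode else 1
        return list(range(min_blocks, total_blocks + 1))

    print(candidate_block_counts(4, multinode=False))  # [1, 2, 3, 4]
    print(candidate_block_counts(4, multinode=True))   # [4]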
dstack/_internal/server/services/jobs/configurators/base.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Dict, List, Optional, Union

 from cachetools import TTLCache, cached

-
+from dstack._internal import settings
 from dstack._internal.core.errors import DockerRegistryError, ServerClientError
 from dstack._internal.core.models.common import RegistryAuth
 from dstack._internal.core.models.configurations import (
@@ -53,14 +53,14 @@ def get_default_image(python_version: str, nvcc: bool = False) -> str:
     suffix = ""
     if nvcc:
         suffix = "-devel"
-    return f"
+    return f"{settings.DSTACK_BASE_IMAGE}:py{python_version}-{settings.DSTACK_BASE_IMAGE_VERSION}-cuda-12.1{suffix}"


 class JobConfigurator(ABC):
     TYPE: RunConfigurationType

     _image_config: Optional[ImageConfig] = None
-    # JobSSHKey should be shared for all jobs in a replica for inter-node
+    # JobSSHKey should be shared for all jobs in a replica for inter-node communication.
     _job_ssh_key: Optional[JobSSHKey] = None

     def __init__(self, run_spec: RunSpec):
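The rewritten `get_default_image` builds the image tag from settings rather than a hard-coded string. Assuming, for illustration only, `DSTACK_BASE_IMAGE = "dstackai/base"` and `DSTACK_BASE_IMAGE_VERSION = "0.7"` (the actual values live in `dstack/_internal/settings.py`, which this release also touches), the function would return:

    get_default_image("3.11")             # "dstackai/base:py3.11-0.7-cuda-12.1"
    get_default_image("3.11", nvcc=True)  # "dstackai/base:py3.11-0.7-cuda-12.1-devel"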
dstack/_internal/server/services/runs.py
CHANGED
@@ -16,7 +16,7 @@ from dstack._internal.core.errors import (
     ServerClientError,
 )
 from dstack._internal.core.models.common import ApplyAction
-from dstack._internal.core.models.configurations import AnyRunConfiguration
+from dstack._internal.core.models.configurations import RUN_PRIORITY_DEFAULT, AnyRunConfiguration
 from dstack._internal.core.models.instances import (
     InstanceAvailability,
     InstanceOfferWithAvailability,
@@ -434,7 +434,12 @@ async def apply_plan(
         # FIXME: potentially long write transaction
         # Avoid getting run_model after update
         await session.execute(
-            update(RunModel)
+            update(RunModel)
+            .where(RunModel.id == current_resource.id)
+            .values(
+                run_spec=run_spec.json(),
+                priority=run_spec.configuration.priority,
+            )
         )
         run = await get_run_by_name(
             session=session,
@@ -495,6 +500,7 @@ async def submit_run(
         status=RunStatus.SUBMITTED,
         run_spec=run_spec.json(),
         last_processed_at=submitted_at,
+        priority=run_spec.configuration.priority,
     )
     session.add(run_model)

@@ -721,15 +727,15 @@ async def _get_pool_offers(
     pool_instances = [i for i in pool_instances if i.id not in detaching_instances_ids]
     multinode = job.job_spec.jobs_per_replica > 1

-
-
-
-
-
-
-
-
-
+    shared_instances_with_offers = get_shared_pool_instances_with_offers(
+        pool_instances=pool_instances,
+        profile=run_spec.merged_profile,
+        requirements=job.job_spec.requirements,
+        volumes=volumes,
+        multinode=multinode,
+    )
+    for _, offer in shared_instances_with_offers:
+        pool_offers.append(offer)

     nonshared_instances = filter_pool_instances(
         pool_instances=pool_instances,
@@ -852,6 +858,13 @@ def _get_job_submission_cost(job_submission: JobSubmission) -> float:


 def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
+    # This function may set defaults for null run_spec values,
+    # although most defaults are resolved when building job_spec
+    # so that we can keep both the original user-supplied value (null in run_spec)
+    # and the default in job_spec.
+    # If a property is stored in job_spec - resolve the default there.
+    # Server defaults are preferable over client defaults so that
+    # the defaults depend on the server version, not the client version.
     if run_spec.run_name is not None:
         validate_dstack_resource_name(run_spec.run_name)
     for mount_point in run_spec.configuration.volumes:
@@ -875,11 +888,14 @@ def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
         raise ServerClientError(
             f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_RUNNING_TTL_SECONDS}s"
         )
+    if run_spec.configuration.priority is None:
+        run_spec.configuration.priority = RUN_PRIORITY_DEFAULT
     set_resources_defaults(run_spec.configuration.resources)


 _UPDATABLE_SPEC_FIELDS = ["repo_code_hash", "configuration"]
-
+_CONF_UPDATABLE_FIELDS = ["priority"]
+_TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS = {
     "dev-environment": ["inactivity_duration"],
     # Most service fields can be updated via replica redeployment.
     # TODO: Allow updating other fields when rolling deployment is supported.
@@ -915,12 +931,9 @@ def _check_can_update_configuration(
         raise ServerClientError(
             f"Configuration type changed from {current.type} to {new.type}, cannot update"
         )
-    updatable_fields =
-
-
-        f"Can only update {', '.join(_CONF_TYPE_TO_UPDATABLE_FIELDS)} configurations."
-        f" Not {new.type}"
-    )
+    updatable_fields = _CONF_UPDATABLE_FIELDS + _TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS.get(
+        new.type, []
+    )
     diff = diff_models(current, new)
     changed_fields = list(diff.keys())
     for key in changed_fields:
dstack/_internal/server/settings.py
CHANGED
@@ -85,6 +85,7 @@ DEFAULT_SERVICE_CLIENT_MAX_BODY_SIZE = int(
 USER_PROJECT_DEFAULT_QUOTA = int(os.getenv("DSTACK_USER_PROJECT_DEFAULT_QUOTA", 10))
 FORBID_SERVICES_WITHOUT_GATEWAY = os.getenv("DSTACK_FORBID_SERVICES_WITHOUT_GATEWAY") is not None

+SERVER_CODE_UPLOAD_LIMIT = int(os.getenv("DSTACK_SERVER_CODE_UPLOAD_LIMIT", 2 * 2**20))

 # Development settings
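Since the guard in routers/repos.py above reads `SERVER_CODE_UPLOAD_LIMIT > 0 and ...`, setting the variable to `0` should disable the check entirely, e.g.:

    DSTACK_SERVER_CODE_UPLOAD_LIMIT=0 dstack server

The default is `2 * 2**20` bytes (2 MiB). Note that the error message in routers/repos.py names `DSTACK_SERVER_CODE_UPLOAD_LIMIT_BYTES`, while the setting actually reads `DSTACK_SERVER_CODE_UPLOAD_LIMIT`.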