dstack 0.19.8__py3-none-any.whl → 0.19.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/config.py +1 -1
- dstack/_internal/cli/commands/project.py +161 -0
- dstack/_internal/cli/commands/ps.py +9 -2
- dstack/_internal/cli/main.py +2 -0
- dstack/_internal/cli/services/configurators/run.py +18 -11
- dstack/_internal/cli/utils/run.py +7 -2
- dstack/_internal/core/backends/azure/compute.py +5 -2
- dstack/_internal/core/backends/cudo/compute.py +1 -1
- dstack/_internal/core/backends/nebius/fabrics.py +1 -0
- dstack/_internal/core/backends/nebius/models.py +1 -1
- dstack/_internal/core/models/configurations.py +19 -3
- dstack/_internal/core/models/resources.py +1 -1
- dstack/_internal/core/models/runs.py +19 -7
- dstack/_internal/server/background/tasks/process_metrics.py +30 -11
- dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +21 -12
- dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
- dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
- dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
- dstack/_internal/server/models.py +6 -1
- dstack/_internal/server/routers/repos.py +8 -4
- dstack/_internal/server/schemas/runner.py +41 -8
- dstack/_internal/server/services/instances.py +6 -2
- dstack/_internal/server/services/jobs/__init__.py +1 -0
- dstack/_internal/server/services/jobs/configurators/base.py +3 -3
- dstack/_internal/server/services/runner/client.py +7 -4
- dstack/_internal/server/services/runs.py +33 -20
- dstack/_internal/server/settings.py +21 -1
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js → main-b4803049eac16aea9a49.js} +4 -4
- dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js.map → main-b4803049eac16aea9a49.js.map} +1 -1
- dstack/_internal/server/testing/common.py +4 -0
- dstack/_internal/server/utils/routers.py +3 -6
- dstack/_internal/settings.py +4 -0
- dstack/api/_public/runs.py +6 -3
- dstack/api/server/_runs.py +6 -0
- dstack/version.py +1 -1
- {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/METADATA +46 -34
- {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/RECORD +42 -38
- {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/WHEEL +0 -0
- {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
|
|
3
|
+
from requests import HTTPError
|
|
4
|
+
from rich.table import Table
|
|
5
|
+
|
|
6
|
+
import dstack.api.server
|
|
7
|
+
from dstack._internal.cli.commands import BaseCommand
|
|
8
|
+
from dstack._internal.cli.utils.common import confirm_ask, console
|
|
9
|
+
from dstack._internal.core.errors import ClientError, CLIError
|
|
10
|
+
from dstack._internal.core.services.configs import ConfigManager
|
|
11
|
+
from dstack._internal.utils.logging import get_logger
|
|
12
|
+
|
|
13
|
+
logger = get_logger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ProjectCommand(BaseCommand):
    """CLI command that manages project entries in the local config.

    Provides the ``add``, ``delete``, ``list`` and ``set-default``
    subcommands. When no subcommand is given, the configured projects are
    listed.
    """

    NAME = "project"
    DESCRIPTION = "Manage projects configs"

    def _register(self):
        """Declare the subcommands and their arguments on the command parser."""
        super()._register()
        subparsers = self._parser.add_subparsers(dest="subcommand", help="Command to execute")

        # Add subcommand
        add_parser = subparsers.add_parser("add", help="Add or update a project config")
        add_parser.add_argument(
            "--name", type=str, help="The name of the project to configure", required=True
        )
        add_parser.add_argument("--url", type=str, help="Server url", required=True)
        add_parser.add_argument("--token", type=str, help="User token", required=True)
        add_parser.add_argument(
            "-y",
            "--yes",
            help="Don't ask for confirmation (e.g. update the config)",
            action="store_true",
        )
        add_parser.add_argument(
            "-n",
            "--no",
            help="Don't ask for confirmation (e.g. do not update the config)",
            action="store_true",
        )
        add_parser.set_defaults(subfunc=self._add)

        # Delete subcommand
        delete_parser = subparsers.add_parser("delete", help="Delete a project config")
        delete_parser.add_argument(
            "--name", type=str, help="The name of the project to delete", required=True
        )
        delete_parser.add_argument(
            "-y",
            "--yes",
            help="Don't ask for confirmation",
            action="store_true",
        )
        delete_parser.set_defaults(subfunc=self._delete)

        # List subcommand
        list_parser = subparsers.add_parser("list", help="List configured projects")
        list_parser.set_defaults(subfunc=self._list)

        # Set default subcommand
        set_default_parser = subparsers.add_parser("set-default", help="Set default project")
        set_default_parser.add_argument(
            "name", type=str, help="The name of the project to set as default"
        )
        set_default_parser.set_defaults(subfunc=self._set_default)

    def _command(self, args: argparse.Namespace):
        """Dispatch to the selected subcommand handler, defaulting to ``list``."""
        # `subfunc` is only set by a subparser, so a bare `project` invocation has none.
        if not hasattr(args, "subfunc"):
            args.subfunc = self._list
        args.subfunc(args)

    def _add(self, args: argparse.Namespace):
        """Check that the project is reachable with the token, then store it.

        The project is fetched from the server first. If the given
        name/url/token differ from the current default project (or there is
        no default project), the project is written to the config and saved.

        Raises:
            CLIError: If the server answers 403 (bad token) or 404 (unknown
                project).
            HTTPError: Re-raised unchanged for any other HTTP failure.
        """
        config_manager = ConfigManager()
        api_client = dstack.api.server.APIClient(base_url=args.url, token=args.token)
        try:
            api_client.projects.get(args.name)
        except HTTPError as e:
            # `response` is optional on requests exceptions; compare against None
            # explicitly because a Response object is falsy for error statuses.
            status_code = e.response.status_code if e.response is not None else None
            if status_code == 403:
                raise CLIError("Forbidden. Ensure the token is valid.")
            if status_code == 404:
                raise CLIError(f"Project '{args.name}' not found.")
            raise

        default_project = config_manager.get_project_config()
        if (
            default_project is None
            or default_project.name != args.name
            or default_project.url != args.url
            or default_project.token != args.token
        ):
            if args.no:
                # --no wins over --yes and suppresses the prompt entirely.
                set_it_as_default = False
            else:
                set_it_as_default = (
                    args.yes
                    or not default_project
                    or confirm_ask(f"Set '{args.name}' as your default project?")
                )
            config_manager.configure_project(
                name=args.name, url=args.url, token=args.token, default=set_it_as_default
            )
            config_manager.save()
            # NOTE(review): the dict is passed as a positional logging argument,
            # not as `extra=`; kept as-is because the intended effect of
            # "show_path" is not visible from here - confirm.
            logger.info(
                f"Configuration updated at {config_manager.config_filepath}", {"show_path": False}
            )

    def _delete(self, args: argparse.Namespace):
        """Remove a project from the config after confirmation (or ``--yes``)."""
        config_manager = ConfigManager()
        if args.yes or confirm_ask(f"Are you sure you want to delete project '{args.name}'?"):
            config_manager.delete_project(args.name)
            config_manager.save()
            console.print("[grey58]OK[/]")

    def _list(self, args: argparse.Namespace):
        """Print a table of configured projects with their URL, user and default mark.

        The username is requested from each project's server. A project whose
        request fails is still listed, with "(invalid token)" in the USER
        column, so one broken entry does not hide the others.
        """
        config_manager = ConfigManager()
        default_project = config_manager.get_project_config()

        table = Table(box=None)
        table.add_column("PROJECT", style="bold", no_wrap=True)
        table.add_column("URL", style="grey58")
        table.add_column("USER", style="grey58")
        table.add_column("DEFAULT", justify="center")

        for project_name in config_manager.list_projects():
            project_config = config_manager.get_project_config(project_name)
            is_default = project_name == default_project.name if default_project else False

            # Get username from API
            try:
                api_client = dstack.api.server.APIClient(
                    base_url=project_config.url, token=project_config.token
                )
                user_info = api_client.users.get_my_user()
                username = user_info.username
            except (ClientError, HTTPError):
                # `_add` shows that the client reports a rejected token as an
                # HTTPError (403), so it must be handled here as well.
                username = "(invalid token)"

            table.add_row(
                project_name,
                project_config.url,
                username,
                "✓" if is_default else "",
                style="bold" if is_default else None,
            )

        console.print(table)

    def _set_default(self, args: argparse.Namespace):
        """Mark an already configured project as the default one.

        Raises:
            CLIError: If no project with the given name is configured.
        """
        config_manager = ConfigManager()
        project_config = config_manager.get_project_config(args.name)
        if project_config is None:
            raise CLIError(f"Project '{args.name}' not found")

        config_manager.configure_project(
            name=args.name, url=project_config.url, token=project_config.token, default=True
        )
        config_manager.save()
        console.print("[grey58]OK[/]")
|
|
@@ -36,10 +36,17 @@ class PsCommand(APIBaseCommand):
|
|
|
36
36
|
help="Watch statuses of runs in realtime",
|
|
37
37
|
action="store_true",
|
|
38
38
|
)
|
|
39
|
+
self._parser.add_argument(
|
|
40
|
+
"-n",
|
|
41
|
+
"--last",
|
|
42
|
+
help="Show only the last N runs. Implies --all",
|
|
43
|
+
type=int,
|
|
44
|
+
default=None,
|
|
45
|
+
)
|
|
39
46
|
|
|
40
47
|
def _command(self, args: argparse.Namespace):
|
|
41
48
|
super()._command(args)
|
|
42
|
-
runs = self.api.runs.list(all=args.all)
|
|
49
|
+
runs = self.api.runs.list(all=args.all, limit=args.last)
|
|
43
50
|
if not args.watch:
|
|
44
51
|
console.print(run_utils.get_runs_table(runs, verbose=args.verbose))
|
|
45
52
|
return
|
|
@@ -49,6 +56,6 @@ class PsCommand(APIBaseCommand):
|
|
|
49
56
|
while True:
|
|
50
57
|
live.update(run_utils.get_runs_table(runs, verbose=args.verbose))
|
|
51
58
|
time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS)
|
|
52
|
-
runs = self.api.runs.list(all=args.all)
|
|
59
|
+
runs = self.api.runs.list(all=args.all, limit=args.last)
|
|
53
60
|
except KeyboardInterrupt:
|
|
54
61
|
pass
|
dstack/_internal/cli/main.py
CHANGED
|
@@ -15,6 +15,7 @@ from dstack._internal.cli.commands.init import InitCommand
|
|
|
15
15
|
from dstack._internal.cli.commands.logs import LogsCommand
|
|
16
16
|
from dstack._internal.cli.commands.metrics import MetricsCommand
|
|
17
17
|
from dstack._internal.cli.commands.offer import OfferCommand
|
|
18
|
+
from dstack._internal.cli.commands.project import ProjectCommand
|
|
18
19
|
from dstack._internal.cli.commands.ps import PsCommand
|
|
19
20
|
from dstack._internal.cli.commands.server import ServerCommand
|
|
20
21
|
from dstack._internal.cli.commands.stats import StatsCommand
|
|
@@ -69,6 +70,7 @@ def main():
|
|
|
69
70
|
OfferCommand.register(subparsers)
|
|
70
71
|
LogsCommand.register(subparsers)
|
|
71
72
|
MetricsCommand.register(subparsers)
|
|
73
|
+
ProjectCommand.register(subparsers)
|
|
72
74
|
PsCommand.register(subparsers)
|
|
73
75
|
ServerCommand.register(subparsers)
|
|
74
76
|
StatsCommand.register(subparsers)
|
|
@@ -98,6 +98,8 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
|
|
|
98
98
|
print_run_plan(run_plan, max_offers=configurator_args.max_offers)
|
|
99
99
|
|
|
100
100
|
confirm_message = "Submit a new run?"
|
|
101
|
+
if conf.name:
|
|
102
|
+
confirm_message = f"Submit the run [code]{conf.name}[/]?"
|
|
101
103
|
stop_run_name = None
|
|
102
104
|
if run_plan.current_resource is not None:
|
|
103
105
|
changed_fields = []
|
|
@@ -130,11 +132,6 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
|
|
|
130
132
|
f"Active run [code]{conf.name}[/] already exists and cannot be updated in-place."
|
|
131
133
|
)
|
|
132
134
|
confirm_message = "Stop and override the run?"
|
|
133
|
-
else:
|
|
134
|
-
console.print(f"Finished run [code]{conf.name}[/] already exists.")
|
|
135
|
-
confirm_message = "Override the run?"
|
|
136
|
-
elif conf.name:
|
|
137
|
-
confirm_message = f"Submit the run [code]{conf.name}[/]?"
|
|
138
135
|
|
|
139
136
|
if not command_args.yes and not confirm_ask(confirm_message):
|
|
140
137
|
console.print("\nExiting...")
|
|
@@ -560,7 +557,9 @@ def print_finished_message(run: Run):
|
|
|
560
557
|
console.print("[code]Done[/]")
|
|
561
558
|
return
|
|
562
559
|
|
|
563
|
-
termination_reason, termination_reason_message =
|
|
560
|
+
termination_reason, termination_reason_message, exit_status = (
|
|
561
|
+
_get_run_termination_reason_and_exit_status(run)
|
|
562
|
+
)
|
|
564
563
|
message = "Run failed due to unknown reason. Check CLI, server, and run logs."
|
|
565
564
|
if run.status == RunStatus.TERMINATED:
|
|
566
565
|
message = "Run terminated due to unknown reason. Check CLI, server, and run logs."
|
|
@@ -572,13 +571,15 @@ def print_finished_message(run: Run):
|
|
|
572
571
|
"Check CLI and server logs for more details."
|
|
573
572
|
)
|
|
574
573
|
elif termination_reason is not None:
|
|
574
|
+
exit_status_details = f"Exit status: {exit_status}.\n" if exit_status else ""
|
|
575
575
|
error_details = (
|
|
576
576
|
f"Error: {termination_reason_message}\n" if termination_reason_message else ""
|
|
577
577
|
)
|
|
578
578
|
message = (
|
|
579
579
|
f"Run failed with error code {termination_reason.name}.\n"
|
|
580
|
+
f"{exit_status_details}"
|
|
580
581
|
f"{error_details}"
|
|
581
|
-
"Check
|
|
582
|
+
f"Check [bold]dstack logs -d {run.name}[/bold] for more details."
|
|
582
583
|
)
|
|
583
584
|
console.print(f"[error]{message}[/]")
|
|
584
585
|
|
|
@@ -589,14 +590,20 @@ def get_run_exit_code(run: Run) -> int:
|
|
|
589
590
|
return 1
|
|
590
591
|
|
|
591
592
|
|
|
592
|
-
def
|
|
593
|
+
def _get_run_termination_reason_and_exit_status(
    run: Run,
) -> Tuple[Optional[JobTerminationReason], Optional[str], Optional[int]]:
    """Return termination details taken from the run's first job.

    Only the first submission of the first job is inspected. The result is
    ``(termination_reason, termination_reason_message, exit_status)``, or
    three ``None`` values when the run has no jobs or the job has no
    submissions.
    """
    nothing = (None, None, None)
    jobs = run._run.jobs
    if not jobs:
        return nothing
    submissions = jobs[0].job_submissions
    if not submissions:
        return nothing
    first_submission = submissions[0]
    reason = first_submission.termination_reason
    reason_message = first_submission.termination_reason_message
    return reason, reason_message, first_submission.exit_status
|
|
600
607
|
|
|
601
608
|
|
|
602
609
|
def _run_resubmitted(run: Run, current_job_submission: Optional[JobSubmission]) -> bool:
|
|
@@ -218,6 +218,11 @@ def _get_run_error(run: Run) -> str:
|
|
|
218
218
|
|
|
219
219
|
|
|
220
220
|
def _get_job_error(job: Job) -> str:
    """Return a short error label for the job's latest submission.

    The label is the termination reason name, followed by the exit status
    when that status is set and non-zero. An empty string is returned when
    the job has no submissions or its latest submission has no termination
    reason.
    """
    # A job without submissions has nothing to report; avoid IndexError on [-1].
    if not job.job_submissions:
        return ""
    job_submission = job.job_submissions[-1]
    termination_reason = job_submission.termination_reason
    exit_status = job_submission.exit_status
    if termination_reason is None:
        return ""
    # A zero or missing exit status adds no information, so it is omitted.
    if exit_status:
        return f"{termination_reason.name} {exit_status}"
    return termination_reason.name
|
|
@@ -391,9 +391,12 @@ class VMImageVariant(enum.Enum):
|
|
|
391
391
|
|
|
392
392
|
|
|
393
393
|
_SUPPORTED_VM_SERIES_PATTERNS = [
|
|
394
|
-
|
|
395
|
-
|
|
394
|
+
# TODO: Support newer CPU series (Dsv6, Esv6).
|
|
395
|
+
# They are NVMe-only and require marking the VM image as NVMe.
|
|
396
|
+
r"D(\d+)s_v3", # Dsv3-series (general purpose)
|
|
397
|
+
r"E(\d+)i?s_v4", # Esv4-series (memory optimized)
|
|
396
398
|
r"E(\d+)-(\d+)s_v4", # Esv4-series (constrained vCPU)
|
|
399
|
+
r"F(\d+)s_v2", # Fsv2-series (compute optimized)
|
|
397
400
|
r"NC(\d+)s_v3", # NCv3-series [V100 16GB]
|
|
398
401
|
r"NC(\d+)as_T4_v3", # NCasT4_v3-series [T4]
|
|
399
402
|
r"ND(\d+)rs_v2", # NDv2-series [8xV100 32GB]
|
|
@@ -147,7 +147,7 @@ class CudoCompute(
|
|
|
147
147
|
|
|
148
148
|
|
|
149
149
|
def _get_image_id(cuda: bool) -> str:
    """Return the image name to use: the NVIDIA/Docker image when *cuda* is truthy."""
    if cuda:
        return "ubuntu-2204-nvidia-535-docker-v20241017"
    return "ubuntu-2204"
|
|
152
152
|
|
|
153
153
|
|
|
@@ -20,6 +20,7 @@ INFINIBAND_FABRICS = [
|
|
|
20
20
|
InfinibandFabric("fabric-5", "gpu-h200-sxm", "eu-west1"),
|
|
21
21
|
InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
|
|
22
22
|
InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
|
|
23
|
+
InfinibandFabric("us-central1-a", "gpu-h200-sxm", "us-central1"),
|
|
23
24
|
]
|
|
24
25
|
|
|
25
26
|
|
|
@@ -5,7 +5,7 @@ from pydantic import Field, root_validator
|
|
|
5
5
|
from dstack._internal.core.backends.base.models import fill_data
|
|
6
6
|
from dstack._internal.core.models.common import CoreModel
|
|
7
7
|
|
|
8
|
-
DEFAULT_PROJECT_NAME_PREFIX = "default
|
|
8
|
+
DEFAULT_PROJECT_NAME_PREFIX = "default"
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class NebiusServiceAccountCreds(CoreModel):
|
|
@@ -23,6 +23,9 @@ ValidPort = conint(gt=0, le=65536)
|
|
|
23
23
|
MAX_INT64 = 2**63 - 1
|
|
24
24
|
SERVICE_HTTPS_DEFAULT = True
|
|
25
25
|
STRIP_PREFIX_DEFAULT = True
|
|
26
|
+
RUN_PRIOTIRY_MIN = 0
|
|
27
|
+
RUN_PRIOTIRY_MAX = 100
|
|
28
|
+
RUN_PRIORITY_DEFAULT = 0
|
|
26
29
|
|
|
27
30
|
|
|
28
31
|
class RunConfigurationType(str, Enum):
|
|
@@ -77,7 +80,8 @@ class ScalingSpec(CoreModel):
|
|
|
77
80
|
Field(
|
|
78
81
|
description="The target value of the metric. "
|
|
79
82
|
"The number of replicas is calculated based on this number and automatically adjusts "
|
|
80
|
-
"(scales up or down) as this metric changes"
|
|
83
|
+
"(scales up or down) as this metric changes",
|
|
84
|
+
gt=0,
|
|
81
85
|
),
|
|
82
86
|
]
|
|
83
87
|
scale_up_delay: Annotated[
|
|
@@ -221,14 +225,26 @@ class BaseRunConfiguration(CoreModel):
|
|
|
221
225
|
)
|
|
222
226
|
),
|
|
223
227
|
] = None
|
|
224
|
-
# deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
|
|
225
|
-
setup: CommandsList = []
|
|
226
228
|
resources: Annotated[
|
|
227
229
|
ResourcesSpec, Field(description="The resources requirements to run the configuration")
|
|
228
230
|
] = ResourcesSpec()
|
|
231
|
+
priority: Annotated[
|
|
232
|
+
Optional[int],
|
|
233
|
+
Field(
|
|
234
|
+
ge=RUN_PRIOTIRY_MIN,
|
|
235
|
+
le=RUN_PRIOTIRY_MAX,
|
|
236
|
+
description=(
|
|
237
|
+
f"The priority of the run, an integer between `{RUN_PRIOTIRY_MIN}` and `{RUN_PRIOTIRY_MAX}`."
|
|
238
|
+
" `dstack` tries to provision runs with higher priority first."
|
|
239
|
+
f" Defaults to `{RUN_PRIORITY_DEFAULT}`"
|
|
240
|
+
),
|
|
241
|
+
),
|
|
242
|
+
] = None
|
|
229
243
|
volumes: Annotated[
|
|
230
244
|
List[Union[MountPoint, str]], Field(description="The volumes mount points")
|
|
231
245
|
] = []
|
|
246
|
+
# deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
|
|
247
|
+
setup: CommandsList = []
|
|
232
248
|
|
|
233
249
|
@validator("python", pre=True, always=True)
|
|
234
250
|
def convert_python(cls, v, values) -> Optional[PythonVersion]:
|
|
@@ -126,7 +126,7 @@ class ComputeCapability(Tuple[int, int]):
|
|
|
126
126
|
|
|
127
127
|
DEFAULT_CPU_COUNT = Range[int](min=2)
|
|
128
128
|
DEFAULT_MEMORY_SIZE = Range[Memory](min=Memory.parse("8GB"))
|
|
129
|
-
DEFAULT_GPU_COUNT = Range[int](min=1
|
|
129
|
+
DEFAULT_GPU_COUNT = Range[int](min=1)
|
|
130
130
|
|
|
131
131
|
|
|
132
132
|
class CPUSpec(CoreModel):
|
|
@@ -104,6 +104,7 @@ class JobTerminationReason(str, Enum):
|
|
|
104
104
|
# Set by the server
|
|
105
105
|
FAILED_TO_START_DUE_TO_NO_CAPACITY = "failed_to_start_due_to_no_capacity"
|
|
106
106
|
INTERRUPTED_BY_NO_CAPACITY = "interrupted_by_no_capacity"
|
|
107
|
+
INSTANCE_UNREACHABLE = "instance_unreachable"
|
|
107
108
|
WAITING_INSTANCE_LIMIT_EXCEEDED = "waiting_instance_limit_exceeded"
|
|
108
109
|
WAITING_RUNNER_LIMIT_EXCEEDED = "waiting_runner_limit_exceeded"
|
|
109
110
|
TERMINATED_BY_USER = "terminated_by_user"
|
|
@@ -126,6 +127,7 @@ class JobTerminationReason(str, Enum):
|
|
|
126
127
|
mapping = {
|
|
127
128
|
self.FAILED_TO_START_DUE_TO_NO_CAPACITY: JobStatus.FAILED,
|
|
128
129
|
self.INTERRUPTED_BY_NO_CAPACITY: JobStatus.FAILED,
|
|
130
|
+
self.INSTANCE_UNREACHABLE: JobStatus.FAILED,
|
|
129
131
|
self.WAITING_INSTANCE_LIMIT_EXCEEDED: JobStatus.FAILED,
|
|
130
132
|
self.WAITING_RUNNER_LIMIT_EXCEEDED: JobStatus.FAILED,
|
|
131
133
|
self.TERMINATED_BY_USER: JobStatus.TERMINATED,
|
|
@@ -262,9 +264,9 @@ class JobRuntimeData(CoreModel):
|
|
|
262
264
|
# or not applicable (container-based backends)
|
|
263
265
|
ports: Optional[dict[int, int]] = None
|
|
264
266
|
# List of volumes used by the job
|
|
265
|
-
volume_names: Optional[list[str]] = None # None for backward
|
|
267
|
+
volume_names: Optional[list[str]] = None # None for backward compatibility
|
|
266
268
|
# Virtual shared offer
|
|
267
|
-
offer: Optional[InstanceOfferWithAvailability] = None # None for backward
|
|
269
|
+
offer: Optional[InstanceOfferWithAvailability] = None # None for backward compatibility
|
|
268
270
|
|
|
269
271
|
|
|
270
272
|
class ClusterInfo(CoreModel):
|
|
@@ -283,6 +285,7 @@ class JobSubmission(CoreModel):
|
|
|
283
285
|
status: JobStatus
|
|
284
286
|
termination_reason: Optional[JobTerminationReason]
|
|
285
287
|
termination_reason_message: Optional[str]
|
|
288
|
+
exit_status: Optional[int]
|
|
286
289
|
job_provisioning_data: Optional[JobProvisioningData]
|
|
287
290
|
job_runtime_data: Optional[JobRuntimeData]
|
|
288
291
|
|
|
@@ -508,7 +511,9 @@ def _get_run_error(
|
|
|
508
511
|
return ""
|
|
509
512
|
if len(run_jobs) > 1:
|
|
510
513
|
return run_termination_reason.name
|
|
511
|
-
run_job_termination_reason =
|
|
514
|
+
run_job_termination_reason, exit_status = _get_run_job_termination_reason_and_exit_status(
|
|
515
|
+
run_jobs
|
|
516
|
+
)
|
|
512
517
|
# For failed runs, also show termination reason to provide more context.
|
|
513
518
|
# For other run statuses, the job termination reason will duplicate run status.
|
|
514
519
|
if run_job_termination_reason is not None and run_termination_reason in [
|
|
@@ -516,13 +521,20 @@ def _get_run_error(
|
|
|
516
521
|
RunTerminationReason.SERVER_ERROR,
|
|
517
522
|
RunTerminationReason.RETRY_LIMIT_EXCEEDED,
|
|
518
523
|
]:
|
|
524
|
+
if exit_status:
|
|
525
|
+
return (
|
|
526
|
+
f"{run_termination_reason.name}\n({run_job_termination_reason.name} {exit_status})"
|
|
527
|
+
)
|
|
519
528
|
return f"{run_termination_reason.name}\n({run_job_termination_reason.name})"
|
|
520
529
|
return run_termination_reason.name
|
|
521
530
|
|
|
522
531
|
|
|
523
|
-
def
|
|
532
|
+
def _get_run_job_termination_reason_and_exit_status(
    run_jobs: List[Job],
) -> tuple[Optional[JobTerminationReason], Optional[int]]:
    """Return the termination reason and exit status of the first terminated job.

    Jobs are scanned in order and only the latest submission of each job is
    considered. The first one that has a termination reason determines the
    result; ``(None, None)`` is returned when there is none.
    """
    for job in run_jobs:
        if not job.job_submissions:
            continue
        latest_submission = job.job_submissions[-1]
        if latest_submission.termination_reason is None:
            continue
        return latest_submission.termination_reason, latest_submission.exit_status
    return None, None
|
|
@@ -2,7 +2,7 @@ import asyncio
|
|
|
2
2
|
import json
|
|
3
3
|
from typing import Dict, List, Optional
|
|
4
4
|
|
|
5
|
-
from sqlalchemy import delete, select
|
|
5
|
+
from sqlalchemy import Delete, delete, select
|
|
6
6
|
from sqlalchemy.orm import joinedload
|
|
7
7
|
|
|
8
8
|
from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT
|
|
@@ -42,11 +42,36 @@ async def collect_metrics():
|
|
|
42
42
|
|
|
43
43
|
|
|
44
44
|
async def delete_metrics():
    """Delete job metrics points that are older than their retention period.

    Points of running jobs and points of finished jobs are expired
    separately, using ``settings.SERVER_METRICS_RUNNING_TTL_SECONDS`` and
    ``settings.SERVER_METRICS_FINISHED_TTL_SECONDS``. Both DELETE statements
    are awaited together.
    """
    now_micro = int(get_current_datetime().timestamp() * 1_000_000)
    running_cutoff_micro = now_micro - settings.SERVER_METRICS_RUNNING_TTL_SECONDS * 1_000_000
    finished_cutoff_micro = now_micro - settings.SERVER_METRICS_FINISHED_TTL_SECONDS * 1_000_000

    # Job ids are selected by status in a subquery so each cutoff applies
    # only to the points of the matching group of jobs.
    running_job_ids = select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
    delete_running_points = delete(JobMetricsPoint).where(
        JobMetricsPoint.job_id.in_(running_job_ids),
        JobMetricsPoint.timestamp_micro < running_cutoff_micro,
    )

    finished_job_ids = select(JobModel.id).where(
        JobModel.status.in_(JobStatus.finished_statuses())
    )
    delete_finished_points = delete(JobMetricsPoint).where(
        JobMetricsPoint.job_id.in_(finished_job_ids),
        JobMetricsPoint.timestamp_micro < finished_cutoff_micro,
    )

    await asyncio.gather(
        _execute_delete_statement(delete_running_points),
        _execute_delete_statement(delete_finished_points),
    )
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
async def _execute_delete_statement(stmt: Delete) -> None:
    """Execute *stmt* in a session obtained from ``get_session_ctx`` and commit."""
    async with get_session_ctx() as db_session:
        await db_session.execute(stmt)
        await db_session.commit()
|
|
51
76
|
|
|
52
77
|
|
|
@@ -134,9 +159,3 @@ def _pull_runner_metrics(
|
|
|
134
159
|
) -> Optional[MetricsResponse]:
|
|
135
160
|
runner_client = client.RunnerClient(port=ports[DSTACK_RUNNER_HTTP_PORT])
|
|
136
161
|
return runner_client.get_metrics()
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
def _get_delete_metrics_cutoff() -> int:
|
|
140
|
-
now = int(get_current_datetime().timestamp() * 1_000_000)
|
|
141
|
-
cutoff = now - (settings.SERVER_METRICS_TTL_SECONDS * 1_000_000)
|
|
142
|
-
return cutoff
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
from collections.abc import Iterable
|
|
3
|
-
from datetime import timedelta
|
|
3
|
+
from datetime import timedelta, timezone
|
|
4
4
|
from typing import Dict, List, Optional
|
|
5
5
|
|
|
6
6
|
from sqlalchemy import select
|
|
@@ -71,6 +71,12 @@ from dstack._internal.utils.logging import get_logger
|
|
|
71
71
|
logger = get_logger(__name__)
|
|
72
72
|
|
|
73
73
|
|
|
74
|
+
# Minimum time before terminating active job in case of connectivity issues.
|
|
75
|
+
# Should be sufficient to survive most problems caused by
|
|
76
|
+
# the server network flickering and providers' glitches.
|
|
77
|
+
JOB_DISCONNECTED_RETRY_TIMEOUT = timedelta(minutes=2)
|
|
78
|
+
|
|
79
|
+
|
|
74
80
|
async def process_running_jobs(batch_size: int = 1):
|
|
75
81
|
tasks = []
|
|
76
82
|
for _ in range(batch_size):
|
|
@@ -202,7 +208,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
202
208
|
user_ssh_key = run.run_spec.ssh_key_pub.strip()
|
|
203
209
|
public_keys = [project.ssh_public_key.strip(), user_ssh_key]
|
|
204
210
|
if job_provisioning_data.backend == BackendType.LOCAL:
|
|
205
|
-
# No need to update ~/.ssh/authorized_keys when running shim
|
|
211
|
+
# No need to update ~/.ssh/authorized_keys when running shim locally
|
|
206
212
|
user_ssh_key = ""
|
|
207
213
|
success = await common_utils.run_async(
|
|
208
214
|
_process_provisioning_with_shim,
|
|
@@ -299,19 +305,38 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
|
|
|
299
305
|
run_model,
|
|
300
306
|
job_model,
|
|
301
307
|
)
|
|
302
|
-
if not success:
|
|
303
|
-
job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
|
|
304
308
|
|
|
305
|
-
if
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
309
|
+
if success:
|
|
310
|
+
job_model.disconnected_at = None
|
|
311
|
+
else:
|
|
312
|
+
if job_model.termination_reason:
|
|
313
|
+
logger.warning(
|
|
314
|
+
"%s: failed because shim/runner returned an error, age=%s",
|
|
315
|
+
fmt(job_model),
|
|
316
|
+
job_submission.age,
|
|
317
|
+
)
|
|
318
|
+
job_model.status = JobStatus.TERMINATING
|
|
319
|
+
# job will be terminated and instance will be emptied by process_terminating_jobs
|
|
320
|
+
else:
|
|
321
|
+
# No job_model.termination_reason set means ssh connection failed
|
|
322
|
+
if job_model.disconnected_at is None:
|
|
323
|
+
job_model.disconnected_at = common_utils.get_current_datetime()
|
|
324
|
+
if _should_terminate_job_due_to_disconnect(job_model):
|
|
325
|
+
logger.warning(
|
|
326
|
+
"%s: failed because instance is unreachable, age=%s",
|
|
327
|
+
fmt(job_model),
|
|
328
|
+
job_submission.age,
|
|
329
|
+
)
|
|
330
|
+
# TODO: Replace with JobTerminationReason.INSTANCE_UNREACHABLE in 0.20 or
|
|
331
|
+
# when CLI <= 0.19.8 is no longer supported
|
|
332
|
+
job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
|
|
333
|
+
job_model.status = JobStatus.TERMINATING
|
|
334
|
+
else:
|
|
335
|
+
logger.warning(
|
|
336
|
+
"%s: is unreachable, waiting for the instance to become reachable again, age=%s",
|
|
337
|
+
fmt(job_model),
|
|
338
|
+
job_submission.age,
|
|
339
|
+
)
|
|
315
340
|
|
|
316
341
|
if (
|
|
317
342
|
initial_status != job_model.status
|
|
@@ -543,7 +568,7 @@ def _process_pulling_with_shim(
|
|
|
543
568
|
if shim_client.is_api_v2_supported(): # raises error if shim is down, causes retry
|
|
544
569
|
task = shim_client.get_task(job_model.id)
|
|
545
570
|
|
|
546
|
-
# If task goes to terminated before the job is submitted to runner, then an error
|
|
571
|
+
# If task goes to terminated before the job is submitted to runner, then an error occurred
|
|
547
572
|
if task.status == TaskStatus.TERMINATED:
|
|
548
573
|
logger.warning(
|
|
549
574
|
"shim failed to execute job %s: %s (%s)",
|
|
@@ -572,7 +597,7 @@ def _process_pulling_with_shim(
|
|
|
572
597
|
else:
|
|
573
598
|
shim_status = shim_client.pull() # raises error if shim is down, causes retry
|
|
574
599
|
|
|
575
|
-
# If shim goes to pending before the job is submitted to runner, then an error
|
|
600
|
+
# If shim goes to pending before the job is submitted to runner, then an error occurred
|
|
576
601
|
if (
|
|
577
602
|
shim_status.state == "pending"
|
|
578
603
|
and shim_status.result is not None
|
|
@@ -651,6 +676,10 @@ def _process_running(
|
|
|
651
676
|
)
|
|
652
677
|
if latest_state_event.termination_message:
|
|
653
678
|
job_model.termination_reason_message = latest_state_event.termination_message
|
|
679
|
+
if (exit_status := latest_state_event.exit_status) is not None:
|
|
680
|
+
job_model.exit_status = exit_status
|
|
681
|
+
if exit_status != 0:
|
|
682
|
+
logger.info("%s: non-zero exit status %s", fmt(job_model), exit_status)
|
|
654
683
|
else:
|
|
655
684
|
_terminate_if_inactivity_duration_exceeded(run_model, job_model, resp.no_connections_secs)
|
|
656
685
|
if job_model.status != previous_status:
|
|
@@ -688,6 +717,15 @@ def _terminate_if_inactivity_duration_exceeded(
|
|
|
688
717
|
)
|
|
689
718
|
|
|
690
719
|
|
|
720
|
+
def _should_terminate_job_due_to_disconnect(job_model: JobModel) -> bool:
    """Tell whether the job has been disconnected for longer than the retry timeout.

    Returns ``False`` when ``job_model.disconnected_at`` is not set. Otherwise
    the stored timestamp is stamped as UTC and the job is due for termination
    once the current time is past it by more than
    ``JOB_DISCONNECTED_RETRY_TIMEOUT``.
    """
    disconnected_at = job_model.disconnected_at
    if disconnected_at is None:
        return False
    now = common_utils.get_current_datetime()
    # NOTE(review): replace() overrides tzinfo without converting, so the stored
    # value is presumably naive UTC - confirm against the column definition.
    deadline = disconnected_at.replace(tzinfo=timezone.utc) + JOB_DISCONNECTED_RETRY_TIMEOUT
    return now > deadline
|
|
727
|
+
|
|
728
|
+
|
|
691
729
|
async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
|
|
692
730
|
policy = job.job_spec.utilization_policy
|
|
693
731
|
if policy is None:
|
|
@@ -818,8 +856,8 @@ def _submit_job_to_runner(
|
|
|
818
856
|
return success_if_not_available
|
|
819
857
|
|
|
820
858
|
runner_client.submit_job(
|
|
821
|
-
|
|
822
|
-
|
|
859
|
+
run=run,
|
|
860
|
+
job=job,
|
|
823
861
|
cluster_info=cluster_info,
|
|
824
862
|
secrets=secrets,
|
|
825
863
|
repo_credentials=repo_credentials,
|