dstack 0.19.8__py3-none-any.whl → 0.19.10__py3-none-any.whl

This diff represents the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of dstack might be problematic.

Files changed (42)
  1. dstack/_internal/cli/commands/config.py +1 -1
  2. dstack/_internal/cli/commands/project.py +161 -0
  3. dstack/_internal/cli/commands/ps.py +9 -2
  4. dstack/_internal/cli/main.py +2 -0
  5. dstack/_internal/cli/services/configurators/run.py +18 -11
  6. dstack/_internal/cli/utils/run.py +7 -2
  7. dstack/_internal/core/backends/azure/compute.py +5 -2
  8. dstack/_internal/core/backends/cudo/compute.py +1 -1
  9. dstack/_internal/core/backends/nebius/fabrics.py +1 -0
  10. dstack/_internal/core/backends/nebius/models.py +1 -1
  11. dstack/_internal/core/models/configurations.py +19 -3
  12. dstack/_internal/core/models/resources.py +1 -1
  13. dstack/_internal/core/models/runs.py +19 -7
  14. dstack/_internal/server/background/tasks/process_metrics.py +30 -11
  15. dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
  16. dstack/_internal/server/background/tasks/process_submitted_jobs.py +21 -12
  17. dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
  18. dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
  19. dstack/_internal/server/migrations/versions/bca2fdf130bf_add_runmodel_priority.py +34 -0
  20. dstack/_internal/server/models.py +6 -1
  21. dstack/_internal/server/routers/repos.py +8 -4
  22. dstack/_internal/server/schemas/runner.py +41 -8
  23. dstack/_internal/server/services/instances.py +6 -2
  24. dstack/_internal/server/services/jobs/__init__.py +1 -0
  25. dstack/_internal/server/services/jobs/configurators/base.py +3 -3
  26. dstack/_internal/server/services/runner/client.py +7 -4
  27. dstack/_internal/server/services/runs.py +33 -20
  28. dstack/_internal/server/settings.py +21 -1
  29. dstack/_internal/server/statics/index.html +1 -1
  30. dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js → main-b4803049eac16aea9a49.js} +4 -4
  31. dstack/_internal/server/statics/{main-b4f65323f5df007e1664.js.map → main-b4803049eac16aea9a49.js.map} +1 -1
  32. dstack/_internal/server/testing/common.py +4 -0
  33. dstack/_internal/server/utils/routers.py +3 -6
  34. dstack/_internal/settings.py +4 -0
  35. dstack/api/_public/runs.py +6 -3
  36. dstack/api/server/_runs.py +6 -0
  37. dstack/version.py +1 -1
  38. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/METADATA +46 -34
  39. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/RECORD +42 -38
  40. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/WHEEL +0 -0
  41. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/entry_points.txt +0 -0
  42. {dstack-0.19.8.dist-info → dstack-0.19.10.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/cli/commands/config.py
@@ -14,7 +14,7 @@ logger = get_logger(__name__)
 
 class ConfigCommand(BaseCommand):
     NAME = "config"
-    DESCRIPTION = "Configure CLI"
+    DESCRIPTION = "Configure CLI (deprecated; use `dstack project`)"
 
     def _register(self):
         super()._register()
dstack/_internal/cli/commands/project.py (new file)
@@ -0,0 +1,161 @@
+import argparse
+
+from requests import HTTPError
+from rich.table import Table
+
+import dstack.api.server
+from dstack._internal.cli.commands import BaseCommand
+from dstack._internal.cli.utils.common import confirm_ask, console
+from dstack._internal.core.errors import ClientError, CLIError
+from dstack._internal.core.services.configs import ConfigManager
+from dstack._internal.utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+class ProjectCommand(BaseCommand):
+    NAME = "project"
+    DESCRIPTION = "Manage projects configs"
+
+    def _register(self):
+        super()._register()
+        subparsers = self._parser.add_subparsers(dest="subcommand", help="Command to execute")
+
+        # Add subcommand
+        add_parser = subparsers.add_parser("add", help="Add or update a project config")
+        add_parser.add_argument(
+            "--name", type=str, help="The name of the project to configure", required=True
+        )
+        add_parser.add_argument("--url", type=str, help="Server url", required=True)
+        add_parser.add_argument("--token", type=str, help="User token", required=True)
+        add_parser.add_argument(
+            "-y",
+            "--yes",
+            help="Don't ask for confirmation (e.g. update the config)",
+            action="store_true",
+        )
+        add_parser.add_argument(
+            "-n",
+            "--no",
+            help="Don't ask for confirmation (e.g. do not update the config)",
+            action="store_true",
+        )
+        add_parser.set_defaults(subfunc=self._add)
+
+        # Delete subcommand
+        delete_parser = subparsers.add_parser("delete", help="Delete a project config")
+        delete_parser.add_argument(
+            "--name", type=str, help="The name of the project to delete", required=True
+        )
+        delete_parser.add_argument(
+            "-y",
+            "--yes",
+            help="Don't ask for confirmation",
+            action="store_true",
+        )
+        delete_parser.set_defaults(subfunc=self._delete)
+
+        # List subcommand
+        list_parser = subparsers.add_parser("list", help="List configured projects")
+        list_parser.set_defaults(subfunc=self._list)
+
+        # Set default subcommand
+        set_default_parser = subparsers.add_parser("set-default", help="Set default project")
+        set_default_parser.add_argument(
+            "name", type=str, help="The name of the project to set as default"
+        )
+        set_default_parser.set_defaults(subfunc=self._set_default)
+
+    def _command(self, args: argparse.Namespace):
+        if not hasattr(args, "subfunc"):
+            args.subfunc = self._list
+        args.subfunc(args)
+
+    def _add(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        api_client = dstack.api.server.APIClient(base_url=args.url, token=args.token)
+        try:
+            api_client.projects.get(args.name)
+        except HTTPError as e:
+            if e.response.status_code == 403:
+                raise CLIError("Forbidden. Ensure the token is valid.")
+            elif e.response.status_code == 404:
+                raise CLIError(f"Project '{args.name}' not found.")
+            else:
+                raise e
+        default_project = config_manager.get_project_config()
+        if (
+            default_project is None
+            or default_project.name != args.name
+            or default_project.url != args.url
+            or default_project.token != args.token
+        ):
+            set_it_as_default = (
+                (
+                    args.yes
+                    or not default_project
+                    or confirm_ask(f"Set '{args.name}' as your default project?")
+                )
+                if not args.no
+                else False
+            )
+            config_manager.configure_project(
+                name=args.name, url=args.url, token=args.token, default=set_it_as_default
+            )
+            config_manager.save()
+        logger.info(
+            f"Configuration updated at {config_manager.config_filepath}", {"show_path": False}
+        )
+
+    def _delete(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        if args.yes or confirm_ask(f"Are you sure you want to delete project '{args.name}'?"):
+            config_manager.delete_project(args.name)
+            config_manager.save()
+            console.print("[grey58]OK[/]")
+
+    def _list(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        default_project = config_manager.get_project_config()
+
+        table = Table(box=None)
+        table.add_column("PROJECT", style="bold", no_wrap=True)
+        table.add_column("URL", style="grey58")
+        table.add_column("USER", style="grey58")
+        table.add_column("DEFAULT", justify="center")
+
+        for project_name in config_manager.list_projects():
+            project_config = config_manager.get_project_config(project_name)
+            is_default = project_name == default_project.name if default_project else False
+
+            # Get username from API
+            try:
+                api_client = dstack.api.server.APIClient(
+                    base_url=project_config.url, token=project_config.token
+                )
+                user_info = api_client.users.get_my_user()
+                username = user_info.username
+            except ClientError:
+                username = "(invalid token)"
+
+            table.add_row(
+                project_name,
+                project_config.url,
+                username,
+                "✓" if is_default else "",
+                style="bold" if is_default else None,
+            )
+
+        console.print(table)
+
+    def _set_default(self, args: argparse.Namespace):
+        config_manager = ConfigManager()
+        project_config = config_manager.get_project_config(args.name)
+        if project_config is None:
+            raise CLIError(f"Project '{args.name}' not found")
+
+        config_manager.configure_project(
+            name=args.name, url=project_config.url, token=project_config.token, default=True
+        )
+        config_manager.save()
+        console.print("[grey58]OK[/]")
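
Note: an illustrative session with the new command, based on the argparse definitions above (server URL, project name, and token values are hypothetical):

    dstack project add --name my-project --url https://my-server.example --token <token>
    dstack project list
    dstack project set-default my-project
    dstack project delete --name my-project --yes

Running a bare `dstack project` falls back to the `list` subcommand via the `subfunc` default in `_command`.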
dstack/_internal/cli/commands/ps.py
@@ -36,10 +36,17 @@ class PsCommand(APIBaseCommand):
             help="Watch statuses of runs in realtime",
             action="store_true",
         )
+        self._parser.add_argument(
+            "-n",
+            "--last",
+            help="Show only the last N runs. Implies --all",
+            type=int,
+            default=None,
+        )
 
     def _command(self, args: argparse.Namespace):
         super()._command(args)
-        runs = self.api.runs.list(all=args.all)
+        runs = self.api.runs.list(all=args.all, limit=args.last)
         if not args.watch:
             console.print(run_utils.get_runs_table(runs, verbose=args.verbose))
             return
@@ -49,6 +56,6 @@ class PsCommand(APIBaseCommand):
                 while True:
                     live.update(run_utils.get_runs_table(runs, verbose=args.verbose))
                     time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS)
-                    runs = self.api.runs.list(all=args.all)
+                    runs = self.api.runs.list(all=args.all, limit=args.last)
         except KeyboardInterrupt:
             pass
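
Note: with the new flag, `dstack ps --last 5` (or `-n 5`) asks the server for at most the five most recent runs; per the help text it implies `--all`, so finished runs are included. The flag is forwarded as the `limit` argument of `runs.list` in both the one-shot and `--watch` code paths above.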
dstack/_internal/cli/main.py
@@ -15,6 +15,7 @@ from dstack._internal.cli.commands.init import InitCommand
 from dstack._internal.cli.commands.logs import LogsCommand
 from dstack._internal.cli.commands.metrics import MetricsCommand
 from dstack._internal.cli.commands.offer import OfferCommand
+from dstack._internal.cli.commands.project import ProjectCommand
 from dstack._internal.cli.commands.ps import PsCommand
 from dstack._internal.cli.commands.server import ServerCommand
 from dstack._internal.cli.commands.stats import StatsCommand
@@ -69,6 +70,7 @@ def main():
     OfferCommand.register(subparsers)
     LogsCommand.register(subparsers)
     MetricsCommand.register(subparsers)
+    ProjectCommand.register(subparsers)
     PsCommand.register(subparsers)
     ServerCommand.register(subparsers)
     StatsCommand.register(subparsers)
dstack/_internal/cli/services/configurators/run.py
@@ -98,6 +98,8 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
         print_run_plan(run_plan, max_offers=configurator_args.max_offers)
 
         confirm_message = "Submit a new run?"
+        if conf.name:
+            confirm_message = f"Submit the run [code]{conf.name}[/]?"
         stop_run_name = None
         if run_plan.current_resource is not None:
             changed_fields = []
@@ -130,11 +132,6 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
                     f"Active run [code]{conf.name}[/] already exists and cannot be updated in-place."
                 )
                 confirm_message = "Stop and override the run?"
-            else:
-                console.print(f"Finished run [code]{conf.name}[/] already exists.")
-                confirm_message = "Override the run?"
-        elif conf.name:
-            confirm_message = f"Submit the run [code]{conf.name}[/]?"
 
         if not command_args.yes and not confirm_ask(confirm_message):
             console.print("\nExiting...")
@@ -560,7 +557,9 @@ def print_finished_message(run: Run):
         console.print("[code]Done[/]")
         return
 
-    termination_reason, termination_reason_message = _get_run_termination_reason(run)
+    termination_reason, termination_reason_message, exit_status = (
+        _get_run_termination_reason_and_exit_status(run)
+    )
     message = "Run failed due to unknown reason. Check CLI, server, and run logs."
     if run.status == RunStatus.TERMINATED:
         message = "Run terminated due to unknown reason. Check CLI, server, and run logs."
@@ -572,13 +571,15 @@ def print_finished_message(run: Run):
             "Check CLI and server logs for more details."
         )
     elif termination_reason is not None:
+        exit_status_details = f"Exit status: {exit_status}.\n" if exit_status else ""
         error_details = (
            f"Error: {termination_reason_message}\n" if termination_reason_message else ""
        )
         message = (
             f"Run failed with error code {termination_reason.name}.\n"
+            f"{exit_status_details}"
             f"{error_details}"
-            "Check CLI, server, and run logs for more details."
+            f"Check [bold]dstack logs -d {run.name}[/bold] for more details."
         )
     console.print(f"[error]{message}[/]")
 
@@ -589,14 +590,20 @@ def get_run_exit_code(run: Run) -> int:
     return 1
 
 
-def _get_run_termination_reason(run: Run) -> Tuple[Optional[JobTerminationReason], Optional[str]]:
+def _get_run_termination_reason_and_exit_status(
+    run: Run,
+) -> Tuple[Optional[JobTerminationReason], Optional[str], Optional[int]]:
     if len(run._run.jobs) == 0:
-        return None, None
+        return None, None, None
     job = run._run.jobs[0]
     if len(job.job_submissions) == 0:
-        return None, None
+        return None, None, None
     job_submission = job.job_submissions[0]
-    return job_submission.termination_reason, job_submission.termination_reason_message
+    return (
+        job_submission.termination_reason,
+        job_submission.termination_reason_message,
+        job_submission.exit_status,
+    )
 
 
 def _run_resubmitted(run: Run, current_job_submission: Optional[JobSubmission]) -> bool:
dstack/_internal/cli/utils/run.py
@@ -218,6 +218,11 @@ def _get_run_error(run: Run) -> str:
 
 
 def _get_job_error(job: Job) -> str:
-    if job.job_submissions[-1].termination_reason is None:
+    job_submission = job.job_submissions[-1]
+    termination_reason = job_submission.termination_reason
+    exit_status = job_submission.exit_status
+    if termination_reason is None:
         return ""
-    return job.job_submissions[-1].termination_reason.name
+    if exit_status:
+        return f"{termination_reason.name} {exit_status}"
+    return termination_reason.name
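
Note: with the exit status appended, the error column produced by `_get_job_error` can now read, e.g., `CONTAINER_EXITED_WITH_ERROR 137` rather than the bare termination reason name (the reason name here is illustrative).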
dstack/_internal/core/backends/azure/compute.py
@@ -391,9 +391,12 @@ class VMImageVariant(enum.Enum):
 
 
 _SUPPORTED_VM_SERIES_PATTERNS = [
-    r"D(\d+)s_v3",  # Dsv3-series
-    r"E(\d+)i?s_v4",  # Esv4-series
+    # TODO: Support newer CPU series (Dsv6, Esv6).
+    # They are NVMe-only and require marking the VM image as NVMe.
+    r"D(\d+)s_v3",  # Dsv3-series (general purpose)
+    r"E(\d+)i?s_v4",  # Esv4-series (memory optimized)
     r"E(\d+)-(\d+)s_v4",  # Esv4-series (constrained vCPU)
+    r"F(\d+)s_v2",  # Fsv2-series (compute optimized)
     r"NC(\d+)s_v3",  # NCv3-series [V100 16GB]
     r"NC(\d+)as_T4_v3",  # NCasT4_v3-series [T4]
     r"ND(\d+)rs_v2",  # NDv2-series [8xV100 32GB]
dstack/_internal/core/backends/cudo/compute.py
@@ -147,7 +147,7 @@ class CudoCompute(
 
 
 def _get_image_id(cuda: bool) -> str:
-    image_name = "ubuntu-2204-nvidia-535-docker-v20240214" if cuda else "ubuntu-2204"
+    image_name = "ubuntu-2204-nvidia-535-docker-v20241017" if cuda else "ubuntu-2204"
     return image_name
 
 
dstack/_internal/core/backends/nebius/fabrics.py
@@ -20,6 +20,7 @@ INFINIBAND_FABRICS = [
     InfinibandFabric("fabric-5", "gpu-h200-sxm", "eu-west1"),
     InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
     InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
+    InfinibandFabric("us-central1-a", "gpu-h200-sxm", "us-central1"),
 ]
 
 
dstack/_internal/core/backends/nebius/models.py
@@ -5,7 +5,7 @@ from pydantic import Field, root_validator
 from dstack._internal.core.backends.base.models import fill_data
 from dstack._internal.core.models.common import CoreModel
 
-DEFAULT_PROJECT_NAME_PREFIX = "default-project"
+DEFAULT_PROJECT_NAME_PREFIX = "default"
 
 
 class NebiusServiceAccountCreds(CoreModel):
dstack/_internal/core/models/configurations.py
@@ -23,6 +23,9 @@ ValidPort = conint(gt=0, le=65536)
 MAX_INT64 = 2**63 - 1
 SERVICE_HTTPS_DEFAULT = True
 STRIP_PREFIX_DEFAULT = True
+RUN_PRIOTIRY_MIN = 0
+RUN_PRIOTIRY_MAX = 100
+RUN_PRIORITY_DEFAULT = 0
 
 
 class RunConfigurationType(str, Enum):
@@ -77,7 +80,8 @@ class ScalingSpec(CoreModel):
         Field(
             description="The target value of the metric. "
             "The number of replicas is calculated based on this number and automatically adjusts "
-            "(scales up or down) as this metric changes"
+            "(scales up or down) as this metric changes",
+            gt=0,
         ),
     ]
     scale_up_delay: Annotated[
@@ -221,14 +225,26 @@ class BaseRunConfiguration(CoreModel):
             )
         ),
     ] = None
-    # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
-    setup: CommandsList = []
     resources: Annotated[
         ResourcesSpec, Field(description="The resources requirements to run the configuration")
     ] = ResourcesSpec()
+    priority: Annotated[
+        Optional[int],
+        Field(
+            ge=RUN_PRIOTIRY_MIN,
+            le=RUN_PRIOTIRY_MAX,
+            description=(
+                f"The priority of the run, an integer between `{RUN_PRIOTIRY_MIN}` and `{RUN_PRIOTIRY_MAX}`."
+                " `dstack` tries to provision runs with higher priority first."
+                f" Defaults to `{RUN_PRIORITY_DEFAULT}`"
+            ),
+        ),
+    ] = None
     volumes: Annotated[
         List[Union[MountPoint, str]], Field(description="The volumes mount points")
     ] = []
+    # deprecated since 0.18.31; task, service -- no effect; dev-environment -- executed right before `init`
+    setup: CommandsList = []
 
     @validator("python", pre=True, always=True)
     def convert_python(cls, v, values) -> Optional[PythonVersion]:
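
Note: the new field surfaces as a top-level key in run configurations. A minimal sketch of a task configuration using it (the `commands` value is illustrative):

    type: task
    commands:
      - python train.py
    priority: 50

Per the field description, runs with higher priority are provisioned first, and omitting the key is equivalent to the default of 0.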
dstack/_internal/core/models/resources.py
@@ -126,7 +126,7 @@ class ComputeCapability(Tuple[int, int]):
 
 DEFAULT_CPU_COUNT = Range[int](min=2)
 DEFAULT_MEMORY_SIZE = Range[Memory](min=Memory.parse("8GB"))
-DEFAULT_GPU_COUNT = Range[int](min=1, max=1)
+DEFAULT_GPU_COUNT = Range[int](min=1)
 
 
 class CPUSpec(CoreModel):
dstack/_internal/core/models/runs.py
@@ -104,6 +104,7 @@ class JobTerminationReason(str, Enum):
     # Set by the server
     FAILED_TO_START_DUE_TO_NO_CAPACITY = "failed_to_start_due_to_no_capacity"
     INTERRUPTED_BY_NO_CAPACITY = "interrupted_by_no_capacity"
+    INSTANCE_UNREACHABLE = "instance_unreachable"
     WAITING_INSTANCE_LIMIT_EXCEEDED = "waiting_instance_limit_exceeded"
     WAITING_RUNNER_LIMIT_EXCEEDED = "waiting_runner_limit_exceeded"
     TERMINATED_BY_USER = "terminated_by_user"
@@ -126,6 +127,7 @@ class JobTerminationReason(str, Enum):
         mapping = {
             self.FAILED_TO_START_DUE_TO_NO_CAPACITY: JobStatus.FAILED,
             self.INTERRUPTED_BY_NO_CAPACITY: JobStatus.FAILED,
+            self.INSTANCE_UNREACHABLE: JobStatus.FAILED,
             self.WAITING_INSTANCE_LIMIT_EXCEEDED: JobStatus.FAILED,
             self.WAITING_RUNNER_LIMIT_EXCEEDED: JobStatus.FAILED,
             self.TERMINATED_BY_USER: JobStatus.TERMINATED,
@@ -262,9 +264,9 @@ class JobRuntimeData(CoreModel):
     # or not applicable (container-based backends)
     ports: Optional[dict[int, int]] = None
     # List of volumes used by the job
-    volume_names: Optional[list[str]] = None  # None for backward compalibility
+    volume_names: Optional[list[str]] = None  # None for backward compatibility
     # Virtual shared offer
-    offer: Optional[InstanceOfferWithAvailability] = None  # None for backward compalibility
+    offer: Optional[InstanceOfferWithAvailability] = None  # None for backward compatibility
 
 
 class ClusterInfo(CoreModel):
@@ -283,6 +285,7 @@ class JobSubmission(CoreModel):
     status: JobStatus
     termination_reason: Optional[JobTerminationReason]
     termination_reason_message: Optional[str]
+    exit_status: Optional[int]
     job_provisioning_data: Optional[JobProvisioningData]
     job_runtime_data: Optional[JobRuntimeData]
 
@@ -508,7 +511,9 @@ def _get_run_error(
         return ""
     if len(run_jobs) > 1:
         return run_termination_reason.name
-    run_job_termination_reason = _get_run_job_termination_reason(run_jobs)
+    run_job_termination_reason, exit_status = _get_run_job_termination_reason_and_exit_status(
+        run_jobs
+    )
     # For failed runs, also show termination reason to provide more context.
     # For other run statuses, the job termination reason will duplicate run status.
     if run_job_termination_reason is not None and run_termination_reason in [
@@ -516,13 +521,20 @@ def _get_run_error(
         RunTerminationReason.SERVER_ERROR,
         RunTerminationReason.RETRY_LIMIT_EXCEEDED,
     ]:
+        if exit_status:
+            return (
+                f"{run_termination_reason.name}\n({run_job_termination_reason.name} {exit_status})"
+            )
         return f"{run_termination_reason.name}\n({run_job_termination_reason.name})"
     return run_termination_reason.name
 
 
-def _get_run_job_termination_reason(run_jobs: List[Job]) -> Optional[JobTerminationReason]:
+def _get_run_job_termination_reason_and_exit_status(
+    run_jobs: List[Job],
+) -> tuple[Optional[JobTerminationReason], Optional[int]]:
     for job in run_jobs:
         if len(job.job_submissions) > 0:
-            if job.job_submissions[-1].termination_reason is not None:
-                return job.job_submissions[-1].termination_reason
-    return None
+            job_submission = job.job_submissions[-1]
+            if job_submission.termination_reason is not None:
+                return job_submission.termination_reason, job_submission.exit_status
+    return None, None
dstack/_internal/server/background/tasks/process_metrics.py
@@ -2,7 +2,7 @@ import asyncio
 import json
 from typing import Dict, List, Optional
 
-from sqlalchemy import delete, select
+from sqlalchemy import Delete, delete, select
 from sqlalchemy.orm import joinedload
 
 from dstack._internal.core.consts import DSTACK_RUNNER_HTTP_PORT
@@ -42,11 +42,36 @@ async def collect_metrics():
 
 
 async def delete_metrics():
-    cutoff = _get_delete_metrics_cutoff()
+    now_timestamp_micro = int(get_current_datetime().timestamp() * 1_000_000)
+    running_timestamp_micro_cutoff = (
+        now_timestamp_micro - settings.SERVER_METRICS_RUNNING_TTL_SECONDS * 1_000_000
+    )
+    finished_timestamp_micro_cutoff = (
+        now_timestamp_micro - settings.SERVER_METRICS_FINISHED_TTL_SECONDS * 1_000_000
+    )
+    await asyncio.gather(
+        _execute_delete_statement(
+            delete(JobMetricsPoint).where(
+                JobMetricsPoint.job_id.in_(
+                    select(JobModel.id).where(JobModel.status.in_([JobStatus.RUNNING]))
+                ),
+                JobMetricsPoint.timestamp_micro < running_timestamp_micro_cutoff,
+            )
+        ),
+        _execute_delete_statement(
+            delete(JobMetricsPoint).where(
+                JobMetricsPoint.job_id.in_(
+                    select(JobModel.id).where(JobModel.status.in_(JobStatus.finished_statuses()))
+                ),
+                JobMetricsPoint.timestamp_micro < finished_timestamp_micro_cutoff,
+            )
+        ),
+    )
+
+
+async def _execute_delete_statement(stmt: Delete) -> None:
     async with get_session_ctx() as session:
-        await session.execute(
-            delete(JobMetricsPoint).where(JobMetricsPoint.timestamp_micro < cutoff)
-        )
+        await session.execute(stmt)
         await session.commit()
 
 
@@ -134,9 +159,3 @@ def _pull_runner_metrics(
 ) -> Optional[MetricsResponse]:
     runner_client = client.RunnerClient(port=ports[DSTACK_RUNNER_HTTP_PORT])
     return runner_client.get_metrics()
-
-
-def _get_delete_metrics_cutoff() -> int:
-    now = int(get_current_datetime().timestamp() * 1_000_000)
-    cutoff = now - (settings.SERVER_METRICS_TTL_SECONDS * 1_000_000)
-    return cutoff
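
Note: the single SERVER_METRICS_TTL_SECONDS cutoff is replaced by separate TTLs for running and finished jobs (the two new settings come from settings.py, whose diff is not shown in this section). Timestamps stay in microseconds; a minimal standalone sketch of the cutoff arithmetic (ttl_seconds is a hypothetical input):

    from datetime import datetime, timezone

    def cutoff_micro(ttl_seconds: int) -> int:
        # points with timestamp_micro below this value are deleted
        now_micro = int(datetime.now(timezone.utc).timestamp() * 1_000_000)
        return now_micro - ttl_seconds * 1_000_000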
dstack/_internal/server/background/tasks/process_running_jobs.py
@@ -1,6 +1,6 @@
 import asyncio
 from collections.abc import Iterable
-from datetime import timedelta
+from datetime import timedelta, timezone
 from typing import Dict, List, Optional
 
 from sqlalchemy import select
@@ -71,6 +71,12 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)
 
 
+# Minimum time before terminating active job in case of connectivity issues.
+# Should be sufficient to survive most problems caused by
+# the server network flickering and providers' glitches.
+JOB_DISCONNECTED_RETRY_TIMEOUT = timedelta(minutes=2)
+
+
 async def process_running_jobs(batch_size: int = 1):
     tasks = []
     for _ in range(batch_size):
@@ -202,7 +208,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
         user_ssh_key = run.run_spec.ssh_key_pub.strip()
         public_keys = [project.ssh_public_key.strip(), user_ssh_key]
         if job_provisioning_data.backend == BackendType.LOCAL:
-            # No need to update ~/.ssh/authorized_keys when running shim localy
+            # No need to update ~/.ssh/authorized_keys when running shim locally
            user_ssh_key = ""
         success = await common_utils.run_async(
             _process_provisioning_with_shim,
@@ -299,19 +305,38 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
                 run_model,
                 job_model,
             )
-            if not success:
-                job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
 
-    if not success:  # kill the job
-        logger.warning(
-            "%s: failed because runner is not available or return an error, age=%s",
-            fmt(job_model),
-            job_submission.age,
-        )
-        job_model.status = JobStatus.TERMINATING
-        if not job_model.termination_reason:
-            job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
-        # job will be terminated and instance will be emptied by process_terminating_jobs
+    if success:
+        job_model.disconnected_at = None
+    else:
+        if job_model.termination_reason:
+            logger.warning(
+                "%s: failed because shim/runner returned an error, age=%s",
+                fmt(job_model),
+                job_submission.age,
+            )
+            job_model.status = JobStatus.TERMINATING
+            # job will be terminated and instance will be emptied by process_terminating_jobs
+        else:
+            # No job_model.termination_reason set means ssh connection failed
+            if job_model.disconnected_at is None:
+                job_model.disconnected_at = common_utils.get_current_datetime()
+            if _should_terminate_job_due_to_disconnect(job_model):
+                logger.warning(
+                    "%s: failed because instance is unreachable, age=%s",
+                    fmt(job_model),
+                    job_submission.age,
+                )
+                # TODO: Replace with JobTerminationReason.INSTANCE_UNREACHABLE in 0.20 or
+                # when CLI <= 0.19.8 is no longer supported
+                job_model.termination_reason = JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY
+                job_model.status = JobStatus.TERMINATING
+            else:
+                logger.warning(
+                    "%s: is unreachable, waiting for the instance to become reachable again, age=%s",
+                    fmt(job_model),
+                    job_submission.age,
+                )
 
     if (
         initial_status != job_model.status
@@ -543,7 +568,7 @@ def _process_pulling_with_shim(
     if shim_client.is_api_v2_supported():  # raises error if shim is down, causes retry
         task = shim_client.get_task(job_model.id)
 
-        # If task goes to terminated before the job is submitted to runner, then an error occured
+        # If task goes to terminated before the job is submitted to runner, then an error occurred
         if task.status == TaskStatus.TERMINATED:
             logger.warning(
                 "shim failed to execute job %s: %s (%s)",
@@ -572,7 +597,7 @@ def _process_pulling_with_shim(
     else:
         shim_status = shim_client.pull()  # raises error if shim is down, causes retry
 
-        # If shim goes to pending before the job is submitted to runner, then an error occured
+        # If shim goes to pending before the job is submitted to runner, then an error occurred
         if (
             shim_status.state == "pending"
             and shim_status.result is not None
@@ -651,6 +676,10 @@ def _process_running(
         )
         if latest_state_event.termination_message:
             job_model.termination_reason_message = latest_state_event.termination_message
+        if (exit_status := latest_state_event.exit_status) is not None:
+            job_model.exit_status = exit_status
+            if exit_status != 0:
+                logger.info("%s: non-zero exit status %s", fmt(job_model), exit_status)
     else:
         _terminate_if_inactivity_duration_exceeded(run_model, job_model, resp.no_connections_secs)
     if job_model.status != previous_status:
@@ -688,6 +717,15 @@ def _terminate_if_inactivity_duration_exceeded(
         )
 
 
+def _should_terminate_job_due_to_disconnect(job_model: JobModel) -> bool:
+    if job_model.disconnected_at is None:
+        return False
+    return (
+        common_utils.get_current_datetime()
+        > job_model.disconnected_at.replace(tzinfo=timezone.utc) + JOB_DISCONNECTED_RETRY_TIMEOUT
+    )
+
+
 async def _check_gpu_utilization(session: AsyncSession, job_model: JobModel, job: Job) -> None:
     policy = job.job_spec.utilization_policy
     if policy is None:
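
Note: the `.replace(tzinfo=timezone.utc)` in the helper above suggests `disconnected_at` is stored as a naive UTC datetime, so it must be made timezone-aware before comparing against the aware current time. A minimal standalone illustration of the pattern (dates are arbitrary):

    from datetime import datetime, timedelta, timezone

    naive = datetime(2025, 1, 1, 12, 0)         # naive UTC, e.g. as loaded from the DB
    aware = naive.replace(tzinfo=timezone.utc)  # attach UTC without shifting the value
    expired = datetime.now(timezone.utc) > aware + timedelta(minutes=2)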
@@ -818,8 +856,8 @@ def _submit_job_to_runner(
         return success_if_not_available
 
     runner_client.submit_job(
-        run_spec=run.run_spec,
-        job_spec=job.job_spec,
+        run=run,
+        job=job,
         cluster_info=cluster_info,
         secrets=secrets,
         repo_credentials=repo_credentials,