dstack 0.19.11rc2__py3-none-any.whl → 0.19.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of dstack might be problematic.

Files changed (32)
  1. dstack/_internal/cli/commands/offer.py +2 -0
  2. dstack/_internal/cli/services/configurators/run.py +43 -42
  3. dstack/_internal/cli/utils/run.py +10 -26
  4. dstack/_internal/core/backends/template/configurator.py.jinja +1 -6
  5. dstack/_internal/core/backends/template/models.py.jinja +4 -0
  6. dstack/_internal/core/models/configurations.py +1 -1
  7. dstack/_internal/core/models/fleets.py +6 -1
  8. dstack/_internal/core/models/profiles.py +43 -3
  9. dstack/_internal/core/models/repos/local.py +19 -13
  10. dstack/_internal/core/models/runs.py +78 -45
  11. dstack/_internal/server/background/tasks/process_running_jobs.py +47 -12
  12. dstack/_internal/server/background/tasks/process_runs.py +14 -1
  13. dstack/_internal/server/services/fleets.py +2 -2
  14. dstack/_internal/server/services/gateways/__init__.py +1 -1
  15. dstack/_internal/server/services/plugins.py +3 -2
  16. dstack/_internal/server/services/runner/client.py +4 -1
  17. dstack/_internal/server/services/runs.py +2 -2
  18. dstack/_internal/server/services/volumes.py +1 -1
  19. dstack/_internal/server/statics/index.html +1 -1
  20. dstack/_internal/server/statics/{main-5b9786c955b42bf93581.js → main-b0e80f8e26a168c129e9.js} +72 -25
  21. dstack/_internal/server/statics/{main-5b9786c955b42bf93581.js.map → main-b0e80f8e26a168c129e9.js.map} +1 -1
  22. dstack/_internal/server/testing/common.py +2 -1
  23. dstack/_internal/utils/common.py +4 -0
  24. dstack/api/server/_fleets.py +5 -1
  25. dstack/api/server/_runs.py +8 -0
  26. dstack/version.py +1 -1
  27. {dstack-0.19.11rc2.dist-info → dstack-0.19.12.dist-info}/METADATA +2 -1
  28. {dstack-0.19.11rc2.dist-info → dstack-0.19.12.dist-info}/RECORD +31 -32
  29. dstack/_internal/utils/ignore.py +0 -92
  30. {dstack-0.19.11rc2.dist-info → dstack-0.19.12.dist-info}/WHEEL +0 -0
  31. {dstack-0.19.11rc2.dist-info → dstack-0.19.12.dist-info}/entry_points.txt +0 -0
  32. {dstack-0.19.11rc2.dist-info → dstack-0.19.12.dist-info}/licenses/LICENSE.md +0 -0
@@ -84,6 +84,8 @@ class OfferCommand(APIBaseCommand):
         job_plan = run_plan.job_plans[0]
 
         if args.format == "json":
+            # FIXME: Should use effective_run_spec from run_plan,
+            # since the spec can be changed by the server and plugins
            output = {
                "project": run_plan.project_name,
                "user": run_plan.user,
@@ -3,7 +3,7 @@ import subprocess
 import sys
 import time
 from pathlib import Path
-from typing import Dict, List, Optional, Set, Tuple
+from typing import Dict, List, Optional, Set
 
 import gpuhunt
 from pydantic import parse_obj_as
@@ -41,7 +41,7 @@ from dstack._internal.core.models.configurations import (
 )
 from dstack._internal.core.models.repos.base import Repo
 from dstack._internal.core.models.resources import CPUSpec
-from dstack._internal.core.models.runs import JobSubmission, JobTerminationReason, RunStatus
+from dstack._internal.core.models.runs import JobStatus, JobSubmission, RunStatus
 from dstack._internal.core.services.configs import ConfigManager
 from dstack._internal.core.services.diff import diff_models
 from dstack._internal.utils.common import local_time
@@ -105,7 +105,7 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
        changed_fields = []
        if run_plan.action == ApplyAction.UPDATE:
            diff = diff_models(
-               run_plan.run_spec.configuration,
+               run_plan.get_effective_run_spec().configuration,
                run_plan.current_resource.run_spec.configuration,
            )
            changed_fields = list(diff.keys())
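For context, diff_models() (imported above from dstack._internal.core.services.diff) is used here to detect which configuration fields the server or plugins changed in the effective run spec. A minimal sketch of the behavior this call relies on, assuming a pydantic v1 model and a result keyed by changed field names (the real helper may record old/new values differently):

from pydantic import BaseModel

def diff_models_sketch(old: BaseModel, new: BaseModel) -> dict:
    # Compare two models of the same type field by field and return the
    # changed fields keyed by name; the caller above only consumes diff.keys().
    changed = {}
    for field in old.__fields__:
        old_value, new_value = getattr(old, field), getattr(new, field)
        if old_value != new_value:
            changed[field] = {"old": old_value, "new": new_value}
    return changed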
@@ -553,35 +553,38 @@ def _print_service_urls(run: Run) -> None:
 
 
 def print_finished_message(run: Run):
+    status_message = (
+        run._run.latest_job_submission.status_message
+        if run._run.latest_job_submission
+        else run._run.status_message
+    )
+    error = (
+        run._run.latest_job_submission.error if run._run.latest_job_submission else run._run.error
+    )
+    termination_reason = (
+        run._run.latest_job_submission.termination_reason
+        if run._run.latest_job_submission
+        else None
+    )
+    termination_reason_message = (
+        run._run.latest_job_submission.termination_reason_message
+        if run._run.latest_job_submission
+        else None
+    )
     if run.status == RunStatus.DONE:
-        console.print("[code]Done[/]")
+        console.print(f"[code]{status_message.capitalize()}[/code]")
         return
+    else:
+        str = f"[error]{status_message.capitalize()}[/error]"
+        if error:
+            str += f" ([error]{error.capitalize()}[/error])"
+        console.print(str)
 
-    termination_reason, termination_reason_message, exit_status = (
-        _get_run_termination_reason_and_exit_status(run)
-    )
-    message = "Run failed due to unknown reason. Check CLI, server, and run logs."
-    if run.status == RunStatus.TERMINATED:
-        message = "Run terminated due to unknown reason. Check CLI, server, and run logs."
-
-    if termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
-        message = (
-            "All provisioning attempts failed. "
-            "This is likely due to cloud providers not having enough capacity. "
-            "Check CLI and server logs for more details."
-        )
-    elif termination_reason is not None:
-        exit_status_details = f"Exit status: {exit_status}.\n" if exit_status else ""
-        error_details = (
-            f"Error: {termination_reason_message}\n" if termination_reason_message else ""
-        )
-        message = (
-            f"Run failed with error code {termination_reason.name}.\n"
-            f"{exit_status_details}"
-            f"{error_details}"
-            f"Check [bold]dstack logs -d {run.name}[/bold] for more details."
-        )
-    console.print(f"[error]{message}[/]")
+    if termination_reason_message:
+        console.print(f"[error]{termination_reason_message}[/error]")
+
+    if termination_reason:
+        console.print(f"Check [code]dstack logs -d {run.name}[/code] for more details.")
 
 
 def get_run_exit_code(run: Run) -> int:
@@ -590,19 +593,17 @@ def get_run_exit_code(run: Run) -> int:
         return 1
 
 
-def _get_run_termination_reason_and_exit_status(
-    run: Run,
-) -> Tuple[Optional[JobTerminationReason], Optional[str], Optional[int]]:
-    if len(run._run.jobs) == 0:
-        return None, None, None
-    job = run._run.jobs[0]
-    if len(job.job_submissions) == 0:
-        return None, None, None
-    job_submission = job.job_submissions[0]
-    return (
-        job_submission.termination_reason,
-        job_submission.termination_reason_message,
-        job_submission.exit_status,
+def _is_ready_to_attach(run: Run) -> bool:
+    return not (
+        run.status
+        in [
+            RunStatus.SUBMITTED,
+            RunStatus.PENDING,
+            RunStatus.PROVISIONING,
+            RunStatus.TERMINATING,
+        ]
+        or run._run.jobs[0].job_submissions[-1].status
+        in [JobStatus.SUBMITTED, JobStatus.PROVISIONING, JobStatus.PULLING]
    )
 
 
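The new _is_ready_to_attach() helper gates attaching until both the run and its latest job submission have progressed past the scheduling/provisioning states. A hypothetical polling loop built on it could look like this (fetch_run is an assumed callable that re-fetches run state, not part of this diff; the real CLI wiring may differ):

import time

def wait_until_attachable(fetch_run, run_name: str, poll_interval: float = 2.0):
    # fetch_run is a hypothetical helper returning a fresh Run for run_name.
    run = fetch_run(run_name)
    while not _is_ready_to_attach(run):
        time.sleep(poll_interval)
        run = fetch_run(run_name)
    return run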
@@ -12,7 +12,6 @@ from dstack._internal.core.models.profiles import (
    TerminationPolicy,
 )
 from dstack._internal.core.models.runs import (
-    Job,
    RunPlan,
 )
 from dstack._internal.core.services.profiles import get_termination
@@ -154,8 +153,7 @@ def get_runs_table(
    table.add_column("BACKEND", style="grey58", ratio=2)
    table.add_column("RESOURCES", ratio=3 if not verbose else 2)
    if verbose:
-        table.add_column("INSTANCE", no_wrap=True, ratio=1)
-        table.add_column("RESERVATION", no_wrap=True, ratio=1)
+        table.add_column("INSTANCE TYPE", no_wrap=True, ratio=1)
    table.add_column("PRICE", style="grey58", ratio=1)
    table.add_column("STATUS", no_wrap=True, ratio=1)
    table.add_column("SUBMITTED", style="grey58", no_wrap=True, ratio=1)
@@ -163,14 +161,14 @@ def get_runs_table(
        table.add_column("ERROR", no_wrap=True, ratio=2)
 
    for run in runs:
-        run_error = _get_run_error(run)
        run = run._run  # TODO(egor-s): make public attribute
 
        run_row: Dict[Union[str, int], Any] = {
            "NAME": run.run_spec.run_name,
            "SUBMITTED": format_date(run.submitted_at),
-            "ERROR": run_error,
        }
+        if run.error:
+            run_row["ERROR"] = run.error
        if len(run.jobs) != 1:
            run_row["STATUS"] = run.status
            add_row_from_dict(table, run_row)
@@ -183,25 +181,26 @@ def get_runs_table(
                status += f" (inactive for {inactive_for})"
            job_row: Dict[Union[str, int], Any] = {
                "NAME": f" replica={job.job_spec.replica_num} job={job.job_spec.job_num}",
-                "STATUS": status,
+                "STATUS": latest_job_submission.status_message,
                "SUBMITTED": format_date(latest_job_submission.submitted_at),
-                "ERROR": _get_job_error(job),
+                "ERROR": latest_job_submission.error,
            }
            jpd = latest_job_submission.job_provisioning_data
            if jpd is not None:
                resources = jpd.instance_type.resources
-                instance = jpd.instance_type.name
+                instance_type = jpd.instance_type.name
                jrd = latest_job_submission.job_runtime_data
                if jrd is not None and jrd.offer is not None:
                    resources = jrd.offer.instance.resources
                    if jrd.offer.total_blocks > 1:
-                        instance += f" ({jrd.offer.blocks}/{jrd.offer.total_blocks})"
+                        instance_type += f" ({jrd.offer.blocks}/{jrd.offer.total_blocks})"
+                if jpd.reservation:
+                    instance_type += f" ({jpd.reservation})"
                job_row.update(
                    {
                        "BACKEND": f"{jpd.backend.value.replace('remote', 'ssh')} ({jpd.region})",
                        "RESOURCES": resources.pretty_format(include_spot=True),
-                        "INSTANCE": instance,
-                        "RESERVATION": jpd.reservation,
+                        "INSTANCE TYPE": instance_type,
                        "PRICE": f"${jpd.price:.4f}".rstrip("0").rstrip("."),
                    }
                )
@@ -211,18 +210,3 @@ def get_runs_table(
            add_row_from_dict(table, job_row, style="secondary" if len(run.jobs) != 1 else None)
 
    return table
-
-
-def _get_run_error(run: Run) -> str:
-    return run._run.error or ""
-
-
-def _get_job_error(job: Job) -> str:
-    job_submission = job.job_submissions[-1]
-    termination_reason = job_submission.termination_reason
-    exit_status = job_submission.exit_status
-    if termination_reason is None:
-        return ""
-    if exit_status:
-        return f"{termination_reason.name} {exit_status}"
-    return termination_reason.name
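With the INSTANCE and RESERVATION columns merged, the reservation id is now appended to the instance type cell in verbose output. A quick sketch of the string composition performed above, with hypothetical values:

instance_type = "p4d.24xlarge"
blocks, total_blocks = 1, 2
reservation = "cr-0123456789abcdef0"

cell = instance_type
if total_blocks > 1:
    cell += f" ({blocks}/{total_blocks})"
if reservation:
    cell += f" ({reservation})"
print(cell)  # p4d.24xlarge (1/2) (cr-0123456789abcdef0)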
@@ -19,9 +19,6 @@ from dstack._internal.core.models.backends.base import (
    BackendType,
 )
 
-# TODO: Add all supported regions and default regions
-REGIONS = []
-
 
 class {{ backend_name }}Configurator(Configurator):
    TYPE = BackendType.{{ backend_name|upper }}
@@ -31,13 +28,11 @@ class {{ backend_name }}Configurator(Configurator):
        self, config: {{ backend_name }}BackendConfigWithCreds, default_creds_enabled: bool
    ):
        self._validate_creds(config.creds)
-        # TODO: Validate additional config parameters if any
+        # TODO: If possible, validate config.regions and any other config parameters
 
    def create_backend(
        self, project_name: str, config: {{ backend_name }}BackendConfigWithCreds
    ) -> BackendRecord:
-        if config.regions is None:
-            config.regions = REGIONS
        return BackendRecord(
            config={{ backend_name }}StoredConfig(
                **{{ backend_name }}BackendConfig.__response__.parse_obj(config).dict()
@@ -22,6 +22,7 @@ class {{ backend_name }}BackendConfig(CoreModel):
    It also serves as a base class for other backend config models.
    Should not include creds.
    """
+
    type: Annotated[
        Literal["{{ backend_name|lower }}"],
        Field(description="The type of backend"),
@@ -37,6 +38,7 @@ class {{ backend_name }}BackendConfigWithCreds({{ backend_name }}BackendConfig):
    """
    Same as `{{ backend_name }}BackendConfig` but also includes creds.
    """
+
    creds: Annotated[Any{{ backend_name }}Creds, Field(description="The credentials")]
 
 
@@ -48,6 +50,7 @@ class {{ backend_name }}StoredConfig({{ backend_name }}BackendConfig):
    The backend config used for config parameters in the DB.
    Can extend `{{ backend_name }}BackendConfig` with additional parameters.
    """
+
    pass
 
 
@@ -55,4 +58,5 @@ class {{ backend_name }}Config({{ backend_name }}StoredConfig):
    """
    The backend config used by `{{ backend_name }}Backend` and `{{ backend_name }}Compute`.
    """
+
    creds: Any{{ backend_name }}Creds
@@ -440,7 +440,7 @@ class ServiceConfigurationParams(CoreModel):
            raise ValueError("The minimum number of replicas must be greater than or equal to 0")
        if v.max < v.min:
            raise ValueError(
-                "The maximum number of replicas must be greater than or equal to the minium number of replicas"
+                "The maximum number of replicas must be greater than or equal to the minimum number of replicas"
            )
        return v
 
@@ -20,6 +20,7 @@ from dstack._internal.core.models.profiles import (
    parse_idle_duration,
 )
 from dstack._internal.core.models.resources import Range, ResourcesSpec
+from dstack._internal.utils.common import list_enum_values_for_annotation
 from dstack._internal.utils.json_schema import add_extra_schema_types
 from dstack._internal.utils.tags import tags_validator
 
@@ -207,7 +208,11 @@ class InstanceGroupParams(CoreModel):
    spot_policy: Annotated[
        Optional[SpotPolicy],
        Field(
-            description="The policy for provisioning spot or on-demand instances: `spot`, `on-demand`, or `auto`"
+            description=(
+                "The policy for provisioning spot or on-demand instances:"
+                f" {list_enum_values_for_annotation(SpotPolicy)}."
+                f" Defaults to `{SpotPolicy.ONDEMAND.value}`"
+            )
        ),
    ] = None
    retry: Annotated[
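Several field descriptions now derive their allowed values from the enum via the new list_enum_values_for_annotation() helper, which corresponds to the small dstack/_internal/utils/common.py addition listed above (+4 lines). A minimal sketch of such a helper, assuming it renders each value in backticks (the real implementation's separators and wording may differ):

from enum import Enum
from typing import Type

def list_enum_values_for_annotation(enum_cls: Type[Enum]) -> str:
    # Render enum values as a human-readable, backtick-quoted list for use in
    # Field(description=...) strings, e.g. "`spot`, `on-demand`, `auto`".
    return ", ".join(f"`{member.value}`" for member in enum_cls)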
@@ -6,6 +6,7 @@ from typing_extensions import Annotated, Literal
 
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel, Duration
+from dstack._internal.utils.common import list_enum_values_for_annotation
 from dstack._internal.utils.tags import tags_validator
 
 DEFAULT_RETRY_DURATION = 3600
@@ -32,6 +33,17 @@ class TerminationPolicy(str, Enum):
    DESTROY_AFTER_IDLE = "destroy-after-idle"
 
 
+class StartupOrder(str, Enum):
+    ANY = "any"
+    MASTER_FIRST = "master-first"
+    WORKERS_FIRST = "workers-first"
+
+
+class StopCriteria(str, Enum):
+    ALL_DONE = "all-done"
+    MASTER_DONE = "master-done"
+
+
 @overload
 def parse_duration(v: None) -> None: ...
 
@@ -102,7 +114,7 @@ class ProfileRetry(CoreModel):
        Field(
            description=(
                "The list of events that should be handled with retry."
-                " Supported events are `no-capacity`, `interruption`, and `error`."
+                f" Supported events are {list_enum_values_for_annotation(RetryEvent)}."
                " Omit to retry on all events"
            )
        ),
@@ -190,7 +202,11 @@ class ProfileParams(CoreModel):
    spot_policy: Annotated[
        Optional[SpotPolicy],
        Field(
-            description="The policy for provisioning spot or on-demand instances: `spot`, `on-demand`, or `auto`. Defaults to `on-demand`"
+            description=(
+                "The policy for provisioning spot or on-demand instances:"
+                f" {list_enum_values_for_annotation(SpotPolicy)}."
+                f" Defaults to `{SpotPolicy.ONDEMAND.value}`"
+            )
        ),
    ] = None
    retry: Annotated[
@@ -225,7 +241,11 @@ class ProfileParams(CoreModel):
    creation_policy: Annotated[
        Optional[CreationPolicy],
        Field(
-            description="The policy for using instances from fleets. Defaults to `reuse-or-create`"
+            description=(
+                "The policy for using instances from fleets:"
+                f" {list_enum_values_for_annotation(CreationPolicy)}."
+                f" Defaults to `{CreationPolicy.REUSE_OR_CREATE.value}`"
+            )
        ),
    ] = None
    idle_duration: Annotated[
@@ -241,6 +261,26 @@ class ProfileParams(CoreModel):
        Optional[UtilizationPolicy],
        Field(description="Run termination policy based on utilization"),
    ] = None
+    startup_order: Annotated[
+        Optional[StartupOrder],
+        Field(
+            description=(
+                f"The order in which master and workers jobs are started:"
+                f" {list_enum_values_for_annotation(StartupOrder)}."
+                f" Defaults to `{StartupOrder.ANY.value}`"
+            )
+        ),
+    ] = None
+    stop_criteria: Annotated[
+        Optional[StopCriteria],
+        Field(
+            description=(
+                "The criteria determining when a multi-node run should be considered finished:"
+                f" {list_enum_values_for_annotation(StopCriteria)}."
+                f" Defaults to `{StopCriteria.ALL_DONE.value}`"
+            )
+        ),
+    ] = None
    fleets: Annotated[
        Optional[list[str]], Field(description="The fleets considered for reuse")
    ] = None
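The new startup_order and stop_criteria fields target multi-node runs. A sketch of setting them programmatically, assuming the Profile model defined in the same module accepts ProfileParams fields as keyword arguments (run configurations would set the equivalent keys declaratively):

from dstack._internal.core.models.profiles import Profile, StartupOrder, StopCriteria

# Hypothetical usage: start workers before the master and stop the run
# as soon as the master job finishes.
profile = Profile(
    name="distributed-training",
    startup_order=StartupOrder.WORKERS_FIRST,
    stop_criteria=StopCriteria.MASTER_DONE,
)
print(profile.startup_order.value, profile.stop_criteria.value)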
@@ -2,13 +2,18 @@ import tarfile
 from pathlib import Path
 from typing import BinaryIO, Optional
 
+import ignore
+import ignore.overrides
 from typing_extensions import Literal
 
 from dstack._internal.core.models.repos.base import BaseRepoInfo, Repo
+from dstack._internal.utils.common import sizeof_fmt
 from dstack._internal.utils.hash import get_sha256, slugify
-from dstack._internal.utils.ignore import GitIgnore
+from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.path import PathLike
 
+logger = get_logger(__name__)
+
 
 class LocalRepoInfo(BaseRepoInfo):
    repo_type: Literal["local"] = "local"
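sizeof_fmt() (imported above) is used below to log the size of the packaged code archive. A minimal sketch of a human-readable byte formatter with that shape, assuming IEC-style units (the real helper's exact output format may differ):

def sizeof_fmt_sketch(num: float, suffix: str = "B") -> str:
    # Convert a byte count into a human-readable string, e.g. 10240 -> "10.0KiB".
    for unit in ("", "Ki", "Mi", "Gi", "Ti"):
        if abs(num) < 1024.0:
            return f"{num:.1f}{unit}{suffix}"
        num /= 1024.0
    return f"{num:.1f}Pi{suffix}"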
@@ -69,22 +74,23 @@ class LocalRepo(Repo):
        self.run_repo_data = repo_data
 
    def write_code_file(self, fp: BinaryIO) -> str:
+        repo_path = Path(self.run_repo_data.repo_dir)
        with tarfile.TarFile(mode="w", fileobj=fp) as t:
-            t.add(
-                self.run_repo_data.repo_dir,
-                arcname="",
-                filter=TarIgnore(self.run_repo_data.repo_dir, globs=[".git"]),
-            )
+            for entry in (
+                ignore.WalkBuilder(repo_path)
+                .overrides(ignore.overrides.OverrideBuilder(repo_path).add("!/.git/").build())
+                .hidden(False)  # do not ignore files that start with a dot
+                .require_git(False)  # respect git ignore rules even if not a git repo
+                .add_custom_ignore_filename(".dstackignore")
+                .build()
+            ):
+                entry_path_within_repo = entry.path().relative_to(repo_path)
+                if entry_path_within_repo != Path("."):
+                    t.add(entry.path(), arcname=entry_path_within_repo, recursive=False)
+        logger.debug("Code file size: %s", sizeof_fmt(fp.tell()))
        return get_sha256(fp)
 
    def get_repo_info(self) -> LocalRepoInfo:
        return LocalRepoInfo(
            repo_dir=self.run_repo_data.repo_dir,
        )
-
-
-class TarIgnore(GitIgnore):
-    def __call__(self, tarinfo: tarfile.TarInfo) -> Optional[tarfile.TarInfo]:
-        if self.ignore(tarinfo.path):
-            return None
-        return tarinfo
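The rewritten write_code_file() replaces the custom GitIgnore/TarIgnore filter with the ignore package's walker, honoring .gitignore rules plus a .dstackignore file. The same walker configuration shown in the change above can be used standalone to preview which files would be packaged, as in this sketch (paths relative to the current directory):

from pathlib import Path

import ignore
import ignore.overrides

repo_path = Path(".")
walker = (
    ignore.WalkBuilder(repo_path)
    # override pattern "!/.git/" excludes the .git directory, as above
    .overrides(ignore.overrides.OverrideBuilder(repo_path).add("!/.git/").build())
    .hidden(False)  # do not ignore files that start with a dot
    .require_git(False)  # respect ignore rules even outside a git repo
    .add_custom_ignore_filename(".dstackignore")
    .build()
)
for entry in walker:
    print(entry.path().relative_to(repo_path))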
@@ -148,9 +148,6 @@ class JobTerminationReason(str, Enum):
        }
        return mapping[self]
 
-    def pretty_repr(self) -> str:
-        return " ".join(self.value.split("_")).capitalize()
-
 
 class Requirements(CoreModel):
    # TODO: Make requirements' fields required
@@ -289,6 +286,9 @@ class JobSubmission(CoreModel):
    exit_status: Optional[int]
    job_provisioning_data: Optional[JobProvisioningData]
    job_runtime_data: Optional[JobRuntimeData]
+    # TODO: make status_message and error a computed field after migrating to pydanticV2
+    status_message: Optional[str]
+    error: Optional[str] = None
 
    @property
    def age(self) -> timedelta:
@@ -301,6 +301,71 @@ class JobSubmission(CoreModel):
            end_time = self.finished_at
        return end_time - self.submitted_at
 
+    @root_validator
+    def _status_message(cls, values) -> Dict:
+        try:
+            status = values["status"]
+            termination_reason = values["termination_reason"]
+            exit_code = values["exit_status"]
+        except KeyError:
+            return values
+        values["status_message"] = JobSubmission._get_status_message(
+            status=status,
+            termination_reason=termination_reason,
+            exit_status=exit_code,
+        )
+        return values
+
+    @staticmethod
+    def _get_status_message(
+        status: JobStatus,
+        termination_reason: Optional[JobTerminationReason],
+        exit_status: Optional[int],
+    ) -> str:
+        if status == JobStatus.DONE:
+            return "exited (0)"
+        elif status == JobStatus.FAILED:
+            if termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
+                return f"exited ({exit_status})"
+            elif termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
+                return "no offers"
+            elif termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
+                return "interrupted"
+            else:
+                return "error"
+        elif status == JobStatus.TERMINATED:
+            if termination_reason == JobTerminationReason.TERMINATED_BY_USER:
+                return "stopped"
+            elif termination_reason == JobTerminationReason.ABORTED_BY_USER:
+                return "aborted"
+        return status.value
+
+    @root_validator
+    def _error(cls, values) -> Dict:
+        try:
+            termination_reason = values["termination_reason"]
+        except KeyError:
+            return values
+        values["error"] = JobSubmission._get_error(termination_reason=termination_reason)
+        return values
+
+    @staticmethod
+    def _get_error(termination_reason: Optional[JobTerminationReason]) -> Optional[str]:
+        error_mapping = {
+            JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
+            JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
+            JobTerminationReason.VOLUME_ERROR: "waiting runner limit exceeded",
+            JobTerminationReason.GATEWAY_ERROR: "gateway error",
+            JobTerminationReason.SCALED_DOWN: "scaled down",
+            JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
+            JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
+            JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed",
+            JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error",
+            JobTerminationReason.EXECUTOR_ERROR: "executor error",
+            JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded",
+        }
+        return error_mapping.get(termination_reason)
+
 
 class Job(CoreModel):
    job_spec: JobSpec
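Taken together, these validators precompute the human-readable status_message and error that the CLI now displays instead of raw termination-reason names. An illustrative check of the mapping, using the static helper added above and enum members referenced in this diff:

from dstack._internal.core.models.runs import (
    JobStatus,
    JobSubmission,
    JobTerminationReason,
)

# A job that failed because its container exited with a non-zero code is
# summarized as "exited (<code>)"; provisioning failures map to "no offers",
# spot interruptions to "interrupted".
message = JobSubmission._get_status_message(
    status=JobStatus.FAILED,
    termination_reason=JobTerminationReason.CONTAINER_EXITED_WITH_ERROR,
    exit_status=137,
)
assert message == "exited (137)"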
@@ -445,15 +510,20 @@ class Run(CoreModel):
    def _error(cls, values) -> Dict:
        try:
            termination_reason = values["termination_reason"]
-            jobs = values["jobs"]
        except KeyError:
            return values
-        values["error"] = _get_run_error(
-            run_termination_reason=termination_reason,
-            run_jobs=jobs,
-        )
+        values["error"] = Run._get_error(termination_reason=termination_reason)
        return values
 
+    @staticmethod
+    def _get_error(termination_reason: Optional[RunTerminationReason]) -> Optional[str]:
+        if termination_reason == RunTerminationReason.RETRY_LIMIT_EXCEEDED:
+            return "retry limit exceeded"
+        elif termination_reason == RunTerminationReason.SERVER_ERROR:
+            return "server error"
+        else:
+            return None
+
 
 class JobPlan(CoreModel):
    job_spec: JobSpec
@@ -502,40 +572,3 @@ def get_policy_map(spot_policy: Optional[SpotPolicy], default: SpotPolicy) -> Op
        SpotPolicy.ONDEMAND: False,
    }
    return policy_map[spot_policy]
-
-
-def _get_run_error(
-    run_termination_reason: Optional[RunTerminationReason],
-    run_jobs: List[Job],
-) -> str:
-    if run_termination_reason is None:
-        return ""
-    if len(run_jobs) > 1:
-        return run_termination_reason.name
-    run_job_termination_reason, exit_status = _get_run_job_termination_reason_and_exit_status(
-        run_jobs
-    )
-    # For failed runs, also show termination reason to provide more context.
-    # For other run statuses, the job termination reason will duplicate run status.
-    if run_job_termination_reason is not None and run_termination_reason in [
-        RunTerminationReason.JOB_FAILED,
-        RunTerminationReason.SERVER_ERROR,
-        RunTerminationReason.RETRY_LIMIT_EXCEEDED,
-    ]:
-        if exit_status:
-            return (
-                f"{run_termination_reason.name}\n({run_job_termination_reason.name} {exit_status})"
-            )
-        return f"{run_termination_reason.name}\n({run_job_termination_reason.name})"
-    return run_termination_reason.name
-
-
-def _get_run_job_termination_reason_and_exit_status(
-    run_jobs: List[Job],
-) -> tuple[Optional[JobTerminationReason], Optional[int]]:
-    for job in run_jobs:
-        if len(job.job_submissions) > 0:
-            job_submission = job.job_submissions[-1]
-            if job_submission.termination_reason is not None:
-                return job_submission.termination_reason, job_submission.exit_status
-    return None, None