dstack 0.19.12rc1__py3-none-any.whl → 0.19.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (62) hide show
  1. dstack/_internal/cli/commands/attach.py +4 -4
  2. dstack/_internal/cli/services/configurators/run.py +44 -47
  3. dstack/_internal/cli/utils/run.py +31 -31
  4. dstack/_internal/core/backends/aws/compute.py +22 -9
  5. dstack/_internal/core/backends/aws/resources.py +26 -0
  6. dstack/_internal/core/backends/base/offers.py +0 -1
  7. dstack/_internal/core/backends/template/configurator.py.jinja +1 -6
  8. dstack/_internal/core/backends/template/models.py.jinja +4 -0
  9. dstack/_internal/core/compatibility/__init__.py +0 -0
  10. dstack/_internal/core/compatibility/fleets.py +72 -0
  11. dstack/_internal/core/compatibility/gateways.py +34 -0
  12. dstack/_internal/core/compatibility/runs.py +131 -0
  13. dstack/_internal/core/compatibility/volumes.py +32 -0
  14. dstack/_internal/core/models/configurations.py +1 -1
  15. dstack/_internal/core/models/fleets.py +6 -1
  16. dstack/_internal/core/models/instances.py +51 -12
  17. dstack/_internal/core/models/profiles.py +43 -3
  18. dstack/_internal/core/models/projects.py +1 -0
  19. dstack/_internal/core/models/repos/local.py +3 -3
  20. dstack/_internal/core/models/runs.py +139 -43
  21. dstack/_internal/server/app.py +46 -1
  22. dstack/_internal/server/background/tasks/process_running_jobs.py +92 -15
  23. dstack/_internal/server/background/tasks/process_runs.py +163 -80
  24. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +42 -0
  25. dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py +39 -0
  26. dstack/_internal/server/models.py +4 -0
  27. dstack/_internal/server/routers/projects.py +4 -3
  28. dstack/_internal/server/routers/prometheus.py +4 -1
  29. dstack/_internal/server/schemas/projects.py +1 -0
  30. dstack/_internal/server/security/permissions.py +36 -0
  31. dstack/_internal/server/services/jobs/__init__.py +1 -0
  32. dstack/_internal/server/services/jobs/configurators/base.py +11 -7
  33. dstack/_internal/server/services/projects.py +54 -1
  34. dstack/_internal/server/services/runner/client.py +4 -1
  35. dstack/_internal/server/services/runs.py +49 -29
  36. dstack/_internal/server/services/services/__init__.py +19 -0
  37. dstack/_internal/server/services/services/autoscalers.py +37 -26
  38. dstack/_internal/server/services/storage/__init__.py +38 -0
  39. dstack/_internal/server/services/storage/base.py +27 -0
  40. dstack/_internal/server/services/storage/gcs.py +44 -0
  41. dstack/_internal/server/services/{storage.py → storage/s3.py} +4 -27
  42. dstack/_internal/server/settings.py +7 -3
  43. dstack/_internal/server/statics/index.html +1 -1
  44. dstack/_internal/server/statics/{main-5b9786c955b42bf93581.js → main-0ac1e1583684417ae4d1.js} +1695 -62
  45. dstack/_internal/server/statics/{main-5b9786c955b42bf93581.js.map → main-0ac1e1583684417ae4d1.js.map} +1 -1
  46. dstack/_internal/server/statics/{main-8f9c66f404e9c7e7e020.css → main-f39c418b05fe14772dd8.css} +1 -1
  47. dstack/_internal/server/testing/common.py +11 -1
  48. dstack/_internal/settings.py +3 -0
  49. dstack/_internal/utils/common.py +4 -0
  50. dstack/api/_public/runs.py +14 -5
  51. dstack/api/server/_fleets.py +9 -69
  52. dstack/api/server/_gateways.py +3 -14
  53. dstack/api/server/_projects.py +2 -2
  54. dstack/api/server/_runs.py +4 -116
  55. dstack/api/server/_volumes.py +3 -14
  56. dstack/plugins/builtin/rest_plugin/_plugin.py +24 -5
  57. dstack/version.py +2 -2
  58. {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/METADATA +1 -1
  59. {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/RECORD +62 -52
  60. {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/WHEEL +0 -0
  61. {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/entry_points.txt +0 -0
  62. {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,131 @@
1
+ from typing import Any, Dict, Optional
2
+
3
+ from dstack._internal.core.models.configurations import ServiceConfiguration
4
+ from dstack._internal.core.models.runs import ApplyRunPlanInput, JobSubmission, RunSpec
5
+ from dstack._internal.server.schemas.runs import GetRunPlanRequest
6
+
7
+
8
def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
    """
    Returns `plan` exclude mapping to exclude certain fields from the request.
    Use this method to exclude new fields when they are not set to keep
    clients backward-compatibility with older servers.

    The result is always a dict of the form `{"plan": {...}}`. When the plan
    carries a `current_resource`, the nested mapping also covers that run,
    the submissions of all its jobs and its latest job submission.
    """
    apply_plan_excludes: Dict[str, Any] = {}
    run_spec_excludes = get_run_spec_excludes(plan.run_spec)
    if run_spec_excludes is not None:
        apply_plan_excludes["run_spec"] = run_spec_excludes
    current_resource = plan.current_resource
    if current_resource is not None:
        apply_plan_excludes["current_resource"] = _get_current_resource_excludes(
            current_resource
        )
    return {"plan": apply_plan_excludes}


def _get_current_resource_excludes(current_resource: Any) -> Dict[str, Any]:
    """
    Builds the exclude mapping for the run stored in `plan.current_resource`.
    """
    # `status_message` is excluded unconditionally.
    excludes: Dict[str, Any] = {"status_message": True}
    if current_resource.deployment_num == 0:
        excludes["deployment_num"] = True
    # May be None when the run spec has nothing to exclude.
    excludes["run_spec"] = get_run_spec_excludes(current_resource.run_spec)
    job_submissions = [js for j in current_resource.jobs for js in j.job_submissions]
    # One shared mapping is applied to every submission of every job, so a
    # field is excluded only if it is unset in all of them.
    # NOTE(review): "__all__" is presumably the apply-to-every-item marker of
    # the serializer consuming this mapping -- confirm against the caller.
    excludes["jobs"] = {
        "__all__": {
            "job_submissions": {"__all__": _get_job_submissions_excludes(job_submissions)}
        }
    }
    latest_job_submission = current_resource.latest_job_submission
    if latest_job_submission is not None:
        excludes["latest_job_submission"] = _get_job_submissions_excludes(
            [latest_job_submission]
        )
    return excludes


def _get_job_submissions_excludes(job_submissions: list) -> Dict[str, Any]:
    """
    Builds one exclude mapping valid for every submission in `job_submissions`.
    A field is excluded only when it is unset in all given submissions
    (vacuously true for an empty list).
    """
    # NOTE(review): JobSubmission's own `status_message` / `error` fields are
    # not excluded here -- confirm older servers accept them.
    excludes: Dict[str, Any] = {}
    if all(map(_should_exclude_job_submission_jpd_cpu_arch, job_submissions)):
        excludes["job_provisioning_data"] = {"instance_type": {"resources": {"cpu_arch"}}}
    if all(map(_should_exclude_job_submission_jrd_cpu_arch, job_submissions)):
        excludes["job_runtime_data"] = {"offer": {"instance": {"resources": {"cpu_arch"}}}}
    if all(js.exit_status is None for js in job_submissions):
        excludes["exit_status"] = True
    if all(js.deployment_num == 0 for js in job_submissions):
        excludes["deployment_num"] = True
    return excludes
60
+
61
+
62
def get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[Dict]:
    """
    Builds the exclude mapping for a get-plan request so that new fields
    left unset are not sent, keeping clients compatible with older servers.

    Always returns a dict (possibly empty).
    """
    run_spec_part = get_run_spec_excludes(request.run_spec)
    excludes = {} if run_spec_part is None else {"run_spec": run_spec_part}
    if request.max_offers is None:
        excludes["max_offers"] = True
    return excludes
74
+
75
+
76
def get_run_spec_excludes(run_spec: RunSpec) -> Optional[Dict]:
    """
    Builds the `run_spec` exclude mapping that drops new fields which are
    not set, so that requests stay compatible with older servers.

    Returns None when there is nothing to exclude.
    """
    configuration = run_spec.configuration
    profile = run_spec.profile
    configuration_excludes: dict[str, Any] = {}
    profile_excludes: set[str] = set()

    def exclude_if_unset(field: str, in_profile: bool) -> None:
        # Exclude `field` from the configuration and, when requested, from
        # the profile, whenever its value is None.
        if getattr(configuration, field) is None:
            configuration_excludes[field] = True
        if in_profile and profile is not None and getattr(profile, field) is None:
            profile_excludes.add(field)

    exclude_if_unset("fleets", in_profile=True)
    exclude_if_unset("tags", in_profile=True)
    # `rate_limits` is only checked for service configurations; any falsy
    # value (not just None) is treated as unset.
    if isinstance(configuration, ServiceConfiguration) and not configuration.rate_limits:
        configuration_excludes["rate_limits"] = True
    exclude_if_unset("shell", in_profile=False)
    exclude_if_unset("priority", in_profile=False)
    exclude_if_unset("startup_order", in_profile=True)
    exclude_if_unset("stop_criteria", in_profile=True)

    spec_excludes: dict[str, Any] = {}
    if configuration_excludes:
        spec_excludes["configuration"] = configuration_excludes
    if profile_excludes:
        spec_excludes["profile"] = profile_excludes
    return spec_excludes or None
118
+
119
+
120
def _should_exclude_job_submission_jpd_cpu_arch(job_submission: JobSubmission) -> bool:
    """
    Tells whether `cpu_arch` under the submission's provisioning data is unset.

    Returns True when `cpu_arch` is None or when any link of the attribute
    chain leading to it cannot be accessed (e.g. an intermediate value is None).
    """
    try:
        cpu_arch = job_submission.job_provisioning_data.instance_type.resources.cpu_arch
    except AttributeError:
        # A missing/None intermediate object means there is no cpu_arch at all.
        return True
    return cpu_arch is None
125
+
126
+
127
def _should_exclude_job_submission_jrd_cpu_arch(job_submission: JobSubmission) -> bool:
    """
    Tells whether `cpu_arch` under the submission's runtime-data offer is unset.

    Returns True when `cpu_arch` is None or when any link of the attribute
    chain leading to it cannot be accessed (e.g. an intermediate value is None).
    """
    try:
        cpu_arch = job_submission.job_runtime_data.offer.instance.resources.cpu_arch
    except AttributeError:
        # A missing/None intermediate object means there is no cpu_arch at all.
        return True
    return cpu_arch is None
@@ -0,0 +1,32 @@
1
+ from typing import Dict
2
+
3
+ from dstack._internal.core.models.volumes import VolumeConfiguration, VolumeSpec
4
+
5
+
6
def get_volume_spec_excludes(volume_spec: VolumeSpec) -> Dict:
    """
    Builds the `volume_spec` exclude mapping that drops new fields which are
    not set, so that requests stay compatible with older servers.

    The returned dict always has a single `"configuration"` key.
    """
    return {"configuration": _get_volume_configuration_excludes(volume_spec.configuration)}
15
+
16
+
17
def get_create_volume_excludes(configuration: VolumeConfiguration) -> Dict:
    """
    Builds the exclude mapping for the create volume request, dropping new
    fields which are not set so that requests stay compatible with older servers.

    The returned dict always has a single `"configuration"` key.
    """
    return {"configuration": _get_volume_configuration_excludes(configuration)}
26
+
27
+
28
def _get_volume_configuration_excludes(configuration: VolumeConfiguration) -> Dict:
    """
    Returns `{"tags": True}` when the configuration has no tags set (None),
    otherwise an empty dict.
    """
    return {"tags": True} if configuration.tags is None else {}
@@ -440,7 +440,7 @@ class ServiceConfigurationParams(CoreModel):
440
440
  raise ValueError("The minimum number of replicas must be greater than or equal to 0")
441
441
  if v.max < v.min:
442
442
  raise ValueError(
443
- "The maximum number of replicas must be greater than or equal to the minium number of replicas"
443
+ "The maximum number of replicas must be greater than or equal to the minimum number of replicas"
444
444
  )
445
445
  return v
446
446
 
@@ -20,6 +20,7 @@ from dstack._internal.core.models.profiles import (
20
20
  parse_idle_duration,
21
21
  )
22
22
  from dstack._internal.core.models.resources import Range, ResourcesSpec
23
+ from dstack._internal.utils.common import list_enum_values_for_annotation
23
24
  from dstack._internal.utils.json_schema import add_extra_schema_types
24
25
  from dstack._internal.utils.tags import tags_validator
25
26
 
@@ -207,7 +208,11 @@ class InstanceGroupParams(CoreModel):
207
208
  spot_policy: Annotated[
208
209
  Optional[SpotPolicy],
209
210
  Field(
210
- description="The policy for provisioning spot or on-demand instances: `spot`, `on-demand`, or `auto`"
211
+ description=(
212
+ "The policy for provisioning spot or on-demand instances:"
213
+ f" {list_enum_values_for_annotation(SpotPolicy)}."
214
+ f" Defaults to `{SpotPolicy.ONDEMAND.value}`"
215
+ )
211
216
  ),
212
217
  ] = None
213
218
  retry: Annotated[
@@ -48,29 +48,68 @@ class Resources(CoreModel):
48
48
  gpus: List[Gpu]
49
49
  spot: bool
50
50
  disk: Disk = Disk(size_mib=102400) # the default value (100GB) for backward compatibility
51
+ # TODO: make description a computed field after migrating to pydanticV2
51
52
  description: str = ""
52
53
  cpu_arch: Optional[gpuhunt.CPUArchitecture] = None
53
54
 
54
- def pretty_format(self, include_spot: bool = False) -> str:
55
+ @root_validator
56
+ def _description(cls, values) -> Dict:
57
+ try:
58
+ description = values["description"]
59
+ if not description:
60
+ cpus = values["cpus"]
61
+ memory_mib = values["memory_mib"]
62
+ gpus = values["gpus"]
63
+ disk_size_mib = values["disk"].size_mib
64
+ spot = values["spot"]
65
+ cpu_arch = values["cpu_arch"]
66
+ values["description"] = Resources._pretty_format(
67
+ cpus, cpu_arch, memory_mib, disk_size_mib, gpus, spot, include_spot=True
68
+ )
69
+ except KeyError:
70
+ return values
71
+ return values
72
+
73
+ @staticmethod
74
+ def _pretty_format(
75
+ cpus: int,
76
+ cpu_arch: Optional[gpuhunt.CPUArchitecture],
77
+ memory_mib: int,
78
+ disk_size_mib: int,
79
+ gpus: List[Gpu],
80
+ spot: bool,
81
+ include_spot: bool = False,
82
+ ) -> str:
55
83
  resources = {}
56
- if self.cpus > 0:
57
- resources["cpus"] = self.cpus
58
- resources["cpu_arch"] = self.cpu_arch
59
- if self.memory_mib > 0:
60
- resources["memory"] = f"{self.memory_mib / 1024:.0f}GB"
61
- if self.disk.size_mib > 0:
62
- resources["disk_size"] = f"{self.disk.size_mib / 1024:.0f}GB"
63
- if self.gpus:
64
- gpu = self.gpus[0]
84
+ if cpus > 0:
85
+ resources["cpus"] = cpus
86
+ resources["cpu_arch"] = cpu_arch
87
+ if memory_mib > 0:
88
+ resources["memory"] = f"{memory_mib / 1024:.0f}GB"
89
+ if disk_size_mib > 0:
90
+ resources["disk_size"] = f"{disk_size_mib / 1024:.0f}GB"
91
+ if gpus:
92
+ gpu = gpus[0]
65
93
  resources["gpu_name"] = gpu.name
66
- resources["gpu_count"] = len(self.gpus)
94
+ resources["gpu_count"] = len(gpus)
67
95
  if gpu.memory_mib > 0:
68
96
  resources["gpu_memory"] = f"{gpu.memory_mib / 1024:.0f}GB"
69
97
  output = pretty_resources(**resources)
70
- if include_spot and self.spot:
98
+ if include_spot and spot:
71
99
  output += " (spot)"
72
100
  return output
73
101
 
102
+ def pretty_format(self, include_spot: bool = False) -> str:
103
+ return Resources._pretty_format(
104
+ self.cpus,
105
+ self.cpu_arch,
106
+ self.memory_mib,
107
+ self.disk.size_mib,
108
+ self.gpus,
109
+ self.spot,
110
+ include_spot,
111
+ )
112
+
74
113
 
75
114
  class InstanceType(CoreModel):
76
115
  name: str
@@ -6,6 +6,7 @@ from typing_extensions import Annotated, Literal
6
6
 
7
7
  from dstack._internal.core.models.backends.base import BackendType
8
8
  from dstack._internal.core.models.common import CoreModel, Duration
9
+ from dstack._internal.utils.common import list_enum_values_for_annotation
9
10
  from dstack._internal.utils.tags import tags_validator
10
11
 
11
12
  DEFAULT_RETRY_DURATION = 3600
@@ -32,6 +33,17 @@ class TerminationPolicy(str, Enum):
32
33
  DESTROY_AFTER_IDLE = "destroy-after-idle"
33
34
 
34
35
 
36
class StartupOrder(str, Enum):
    """Allowed values for the order in which master and workers jobs are started."""

    ANY = "any"
    MASTER_FIRST = "master-first"
    WORKERS_FIRST = "workers-first"
40
+
41
+
42
class StopCriteria(str, Enum):
    """Allowed criteria for when a multi-node run should be considered finished."""

    ALL_DONE = "all-done"
    MASTER_DONE = "master-done"
45
+
46
+
35
47
  @overload
36
48
  def parse_duration(v: None) -> None: ...
37
49
 
@@ -102,7 +114,7 @@ class ProfileRetry(CoreModel):
102
114
  Field(
103
115
  description=(
104
116
  "The list of events that should be handled with retry."
105
- " Supported events are `no-capacity`, `interruption`, and `error`."
117
+ f" Supported events are {list_enum_values_for_annotation(RetryEvent)}."
106
118
  " Omit to retry on all events"
107
119
  )
108
120
  ),
@@ -190,7 +202,11 @@ class ProfileParams(CoreModel):
190
202
  spot_policy: Annotated[
191
203
  Optional[SpotPolicy],
192
204
  Field(
193
- description="The policy for provisioning spot or on-demand instances: `spot`, `on-demand`, or `auto`. Defaults to `on-demand`"
205
+ description=(
206
+ "The policy for provisioning spot or on-demand instances:"
207
+ f" {list_enum_values_for_annotation(SpotPolicy)}."
208
+ f" Defaults to `{SpotPolicy.ONDEMAND.value}`"
209
+ )
194
210
  ),
195
211
  ] = None
196
212
  retry: Annotated[
@@ -225,7 +241,11 @@ class ProfileParams(CoreModel):
225
241
  creation_policy: Annotated[
226
242
  Optional[CreationPolicy],
227
243
  Field(
228
- description="The policy for using instances from fleets. Defaults to `reuse-or-create`"
244
+ description=(
245
+ "The policy for using instances from fleets:"
246
+ f" {list_enum_values_for_annotation(CreationPolicy)}."
247
+ f" Defaults to `{CreationPolicy.REUSE_OR_CREATE.value}`"
248
+ )
229
249
  ),
230
250
  ] = None
231
251
  idle_duration: Annotated[
@@ -241,6 +261,26 @@ class ProfileParams(CoreModel):
241
261
  Optional[UtilizationPolicy],
242
262
  Field(description="Run termination policy based on utilization"),
243
263
  ] = None
264
+ startup_order: Annotated[
265
+ Optional[StartupOrder],
266
+ Field(
267
+ description=(
268
+ f"The order in which master and workers jobs are started:"
269
+ f" {list_enum_values_for_annotation(StartupOrder)}."
270
+ f" Defaults to `{StartupOrder.ANY.value}`"
271
+ )
272
+ ),
273
+ ] = None
274
+ stop_criteria: Annotated[
275
+ Optional[StopCriteria],
276
+ Field(
277
+ description=(
278
+ "The criteria determining when a multi-node run should be considered finished:"
279
+ f" {list_enum_values_for_annotation(StopCriteria)}."
280
+ f" Defaults to `{StopCriteria.ALL_DONE.value}`"
281
+ )
282
+ ),
283
+ ] = None
244
284
  fleets: Annotated[
245
285
  Optional[list[str]], Field(description="The fleets considered for reuse")
246
286
  ] = None
@@ -25,3 +25,4 @@ class Project(CoreModel):
25
25
  created_at: Optional[datetime] = None
26
26
  backends: List[BackendInfo]
27
27
  members: List[Member]
28
+ is_public: bool = False
@@ -84,9 +84,9 @@ class LocalRepo(Repo):
84
84
  .add_custom_ignore_filename(".dstackignore")
85
85
  .build()
86
86
  ):
87
- path = entry.path().relative_to(repo_path.absolute())
88
- if path != Path("."):
89
- t.add(path, recursive=False)
87
+ entry_path_within_repo = entry.path().relative_to(repo_path)
88
+ if entry_path_within_repo != Path("."):
89
+ t.add(entry.path(), arcname=entry_path_within_repo, recursive=False)
90
90
  logger.debug("Code file size: %s", sizeof_fmt(fp.tell()))
91
91
  return get_sha256(fp)
92
92
 
@@ -148,8 +148,18 @@ class JobTerminationReason(str, Enum):
148
148
  }
149
149
  return mapping[self]
150
150
 
151
- def pretty_repr(self) -> str:
152
- return " ".join(self.value.split("_")).capitalize()
151
+ def to_retry_event(self) -> Optional[RetryEvent]:
152
+ """
153
+ Returns:
154
+ the retry event this termination reason triggers
155
+ or None if this termination reason should not be retried
156
+ """
157
+ mapping = {
158
+ self.FAILED_TO_START_DUE_TO_NO_CAPACITY: RetryEvent.NO_CAPACITY,
159
+ self.INTERRUPTED_BY_NO_CAPACITY: RetryEvent.INTERRUPTION,
160
+ }
161
+ default = RetryEvent.ERROR if self.to_status() == JobStatus.FAILED else None
162
+ return mapping.get(self, default)
153
163
 
154
164
 
155
165
  class Requirements(CoreModel):
@@ -279,6 +289,7 @@ class ClusterInfo(CoreModel):
279
289
  class JobSubmission(CoreModel):
280
290
  id: UUID4
281
291
  submission_num: int
292
+ deployment_num: int = 0 # default for compatibility with pre-0.19.14 servers
282
293
  submitted_at: datetime
283
294
  last_processed_at: datetime
284
295
  finished_at: Optional[datetime]
@@ -289,6 +300,9 @@ class JobSubmission(CoreModel):
289
300
  exit_status: Optional[int]
290
301
  job_provisioning_data: Optional[JobProvisioningData]
291
302
  job_runtime_data: Optional[JobRuntimeData]
303
+ # TODO: make status_message and error a computed field after migrating to pydanticV2
304
+ status_message: Optional[str]
305
+ error: Optional[str] = None
292
306
 
293
307
  @property
294
308
  def age(self) -> timedelta:
@@ -301,6 +315,71 @@ class JobSubmission(CoreModel):
301
315
  end_time = self.finished_at
302
316
  return end_time - self.submitted_at
303
317
 
318
+ @root_validator
319
+ def _status_message(cls, values) -> Dict:
320
+ try:
321
+ status = values["status"]
322
+ termination_reason = values["termination_reason"]
323
+ exit_code = values["exit_status"]
324
+ except KeyError:
325
+ return values
326
+ values["status_message"] = JobSubmission._get_status_message(
327
+ status=status,
328
+ termination_reason=termination_reason,
329
+ exit_status=exit_code,
330
+ )
331
+ return values
332
+
333
+ @staticmethod
334
+ def _get_status_message(
335
+ status: JobStatus,
336
+ termination_reason: Optional[JobTerminationReason],
337
+ exit_status: Optional[int],
338
+ ) -> str:
339
+ if status == JobStatus.DONE:
340
+ return "exited (0)"
341
+ elif status == JobStatus.FAILED:
342
+ if termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
343
+ return f"exited ({exit_status})"
344
+ elif termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
345
+ return "no offers"
346
+ elif termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
347
+ return "interrupted"
348
+ else:
349
+ return "error"
350
+ elif status == JobStatus.TERMINATED:
351
+ if termination_reason == JobTerminationReason.TERMINATED_BY_USER:
352
+ return "stopped"
353
+ elif termination_reason == JobTerminationReason.ABORTED_BY_USER:
354
+ return "aborted"
355
+ return status.value
356
+
357
+ @root_validator
358
+ def _error(cls, values) -> Dict:
359
+ try:
360
+ termination_reason = values["termination_reason"]
361
+ except KeyError:
362
+ return values
363
+ values["error"] = JobSubmission._get_error(termination_reason=termination_reason)
364
+ return values
365
+
366
+ @staticmethod
367
+ def _get_error(termination_reason: Optional[JobTerminationReason]) -> Optional[str]:
368
+ error_mapping = {
369
+ JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
370
+ JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
371
+ JobTerminationReason.VOLUME_ERROR: "volume error",
372
+ JobTerminationReason.GATEWAY_ERROR: "gateway error",
373
+ JobTerminationReason.SCALED_DOWN: "scaled down",
374
+ JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
375
+ JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
376
+ JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed",
377
+ JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error",
378
+ JobTerminationReason.EXECUTOR_ERROR: "executor error",
379
+ JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded",
380
+ }
381
+ return error_mapping.get(termination_reason)
382
+
304
383
 
305
384
  class Job(CoreModel):
306
385
  job_spec: JobSpec
@@ -431,12 +510,14 @@ class Run(CoreModel):
431
510
  submitted_at: datetime
432
511
  last_processed_at: datetime
433
512
  status: RunStatus
513
+ status_message: Optional[str] = None
434
514
  termination_reason: Optional[RunTerminationReason]
435
515
  run_spec: RunSpec
436
516
  jobs: List[Job]
437
517
  latest_job_submission: Optional[JobSubmission]
438
518
  cost: float = 0
439
519
  service: Optional[ServiceSpec] = None
520
+ deployment_num: int = 0 # default for compatibility with pre-0.19.14 servers
440
521
  # TODO: make error a computed field after migrating to pydanticV2
441
522
  error: Optional[str] = None
442
523
  deleted: Optional[bool] = None
@@ -445,15 +526,67 @@ class Run(CoreModel):
445
526
  def _error(cls, values) -> Dict:
446
527
  try:
447
528
  termination_reason = values["termination_reason"]
448
- jobs = values["jobs"]
449
529
  except KeyError:
450
530
  return values
451
- values["error"] = _get_run_error(
452
- run_termination_reason=termination_reason,
453
- run_jobs=jobs,
531
+ values["error"] = Run._get_error(termination_reason=termination_reason)
532
+ return values
533
+
534
+ @staticmethod
535
+ def _get_error(termination_reason: Optional[RunTerminationReason]) -> Optional[str]:
536
+ if termination_reason == RunTerminationReason.RETRY_LIMIT_EXCEEDED:
537
+ return "retry limit exceeded"
538
+ elif termination_reason == RunTerminationReason.SERVER_ERROR:
539
+ return "server error"
540
+ else:
541
+ return None
542
+
543
+ @root_validator
544
+ def _status_message(cls, values) -> Dict:
545
+ try:
546
+ status = values["status"]
547
+ jobs: List[Job] = values["jobs"]
548
+ retry_on_events = (
549
+ jobs[0].job_spec.retry.on_events if jobs and jobs[0].job_spec.retry else []
550
+ )
551
+ termination_reason = Run.get_last_termination_reason(jobs[0]) if jobs else None
552
+ except KeyError:
553
+ return values
554
+ values["status_message"] = Run._get_status_message(
555
+ status=status,
556
+ retry_on_events=retry_on_events,
557
+ termination_reason=termination_reason,
454
558
  )
455
559
  return values
456
560
 
561
+ @staticmethod
562
+ def get_last_termination_reason(job: "Job") -> Optional[JobTerminationReason]:
563
+ for submission in reversed(job.job_submissions):
564
+ if submission.termination_reason is not None:
565
+ return submission.termination_reason
566
+ return None
567
+
568
+ @staticmethod
569
+ def _get_status_message(
570
+ status: RunStatus,
571
+ retry_on_events: List[RetryEvent],
572
+ termination_reason: Optional[JobTerminationReason],
573
+ ) -> str:
574
+ # Currently, `retrying` is shown only for `no-capacity` events
575
+ if (
576
+ status in [RunStatus.SUBMITTED, RunStatus.PENDING]
577
+ and termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
578
+ and RetryEvent.NO_CAPACITY in retry_on_events
579
+ ):
580
+ return "retrying"
581
+ return status.value
582
+
583
+ def is_deployment_in_progress(self) -> bool:
584
+ return any(
585
+ not j.job_submissions[-1].status.is_finished()
586
+ and j.job_submissions[-1].deployment_num != self.deployment_num
587
+ for j in self.jobs
588
+ )
589
+
457
590
 
458
591
  class JobPlan(CoreModel):
459
592
  job_spec: JobSpec
@@ -502,40 +635,3 @@ def get_policy_map(spot_policy: Optional[SpotPolicy], default: SpotPolicy) -> Op
502
635
  SpotPolicy.ONDEMAND: False,
503
636
  }
504
637
  return policy_map[spot_policy]
505
-
506
-
507
- def _get_run_error(
508
- run_termination_reason: Optional[RunTerminationReason],
509
- run_jobs: List[Job],
510
- ) -> str:
511
- if run_termination_reason is None:
512
- return ""
513
- if len(run_jobs) > 1:
514
- return run_termination_reason.name
515
- run_job_termination_reason, exit_status = _get_run_job_termination_reason_and_exit_status(
516
- run_jobs
517
- )
518
- # For failed runs, also show termination reason to provide more context.
519
- # For other run statuses, the job termination reason will duplicate run status.
520
- if run_job_termination_reason is not None and run_termination_reason in [
521
- RunTerminationReason.JOB_FAILED,
522
- RunTerminationReason.SERVER_ERROR,
523
- RunTerminationReason.RETRY_LIMIT_EXCEEDED,
524
- ]:
525
- if exit_status:
526
- return (
527
- f"{run_termination_reason.name}\n({run_job_termination_reason.name} {exit_status})"
528
- )
529
- return f"{run_termination_reason.name}\n({run_job_termination_reason.name})"
530
- return run_termination_reason.name
531
-
532
-
533
- def _get_run_job_termination_reason_and_exit_status(
534
- run_jobs: List[Job],
535
- ) -> tuple[Optional[JobTerminationReason], Optional[int]]:
536
- for job in run_jobs:
537
- if len(job.job_submissions) > 0:
538
- job_submission = job.job_submissions[-1]
539
- if job_submission.termination_reason is not None:
540
- return job_submission.termination_reason, job_submission.exit_status
541
- return None, None