dstack 0.19.18__py3-none-any.whl → 0.19.19__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

This version of dstack has been flagged as potentially problematic.

Files changed (69)
  1. dstack/_internal/cli/services/configurators/fleet.py +99 -1
  2. dstack/_internal/cli/services/profile.py +1 -1
  3. dstack/_internal/core/compatibility/runs.py +12 -1
  4. dstack/_internal/core/compatibility/volumes.py +2 -0
  5. dstack/_internal/core/models/common.py +38 -2
  6. dstack/_internal/core/models/configurations.py +9 -1
  7. dstack/_internal/core/models/fleets.py +2 -1
  8. dstack/_internal/core/models/profiles.py +8 -5
  9. dstack/_internal/core/models/resources.py +15 -8
  10. dstack/_internal/core/models/runs.py +41 -138
  11. dstack/_internal/core/models/volumes.py +14 -0
  12. dstack/_internal/core/services/diff.py +30 -10
  13. dstack/_internal/core/services/ssh/attach.py +2 -0
  14. dstack/_internal/server/app.py +17 -9
  15. dstack/_internal/server/background/__init__.py +5 -3
  16. dstack/_internal/server/background/tasks/process_gateways.py +46 -28
  17. dstack/_internal/server/background/tasks/process_idle_volumes.py +139 -0
  18. dstack/_internal/server/background/tasks/process_submitted_jobs.py +2 -0
  19. dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +6 -6
  20. dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
  21. dstack/_internal/server/models.py +1 -0
  22. dstack/_internal/server/routers/backends.py +23 -16
  23. dstack/_internal/server/routers/files.py +7 -6
  24. dstack/_internal/server/routers/fleets.py +47 -36
  25. dstack/_internal/server/routers/gateways.py +27 -18
  26. dstack/_internal/server/routers/instances.py +18 -13
  27. dstack/_internal/server/routers/logs.py +7 -3
  28. dstack/_internal/server/routers/metrics.py +14 -8
  29. dstack/_internal/server/routers/projects.py +33 -22
  30. dstack/_internal/server/routers/repos.py +7 -6
  31. dstack/_internal/server/routers/runs.py +49 -28
  32. dstack/_internal/server/routers/secrets.py +20 -15
  33. dstack/_internal/server/routers/server.py +7 -4
  34. dstack/_internal/server/routers/users.py +22 -19
  35. dstack/_internal/server/routers/volumes.py +34 -25
  36. dstack/_internal/server/schemas/logs.py +2 -2
  37. dstack/_internal/server/schemas/runs.py +17 -5
  38. dstack/_internal/server/services/fleets.py +354 -72
  39. dstack/_internal/server/services/gateways/__init__.py +13 -4
  40. dstack/_internal/server/services/gateways/client.py +5 -3
  41. dstack/_internal/server/services/instances.py +8 -0
  42. dstack/_internal/server/services/jobs/__init__.py +45 -0
  43. dstack/_internal/server/services/jobs/configurators/base.py +7 -0
  44. dstack/_internal/server/services/locking.py +3 -1
  45. dstack/_internal/server/services/logging.py +4 -2
  46. dstack/_internal/server/services/logs/__init__.py +15 -2
  47. dstack/_internal/server/services/logs/aws.py +2 -4
  48. dstack/_internal/server/services/logs/filelog.py +33 -27
  49. dstack/_internal/server/services/logs/gcp.py +3 -5
  50. dstack/_internal/server/services/proxy/repo.py +4 -1
  51. dstack/_internal/server/services/runs.py +115 -32
  52. dstack/_internal/server/services/services/__init__.py +2 -1
  53. dstack/_internal/server/services/users.py +3 -1
  54. dstack/_internal/server/services/volumes.py +13 -0
  55. dstack/_internal/server/settings.py +7 -2
  56. dstack/_internal/server/statics/index.html +1 -1
  57. dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js → main-64f8273740c4b52c18f5.js} +6 -6
  58. dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js.map → main-64f8273740c4b52c18f5.js.map} +1 -1
  59. dstack/_internal/server/testing/common.py +41 -5
  60. dstack/_internal/server/utils/routers.py +31 -8
  61. dstack/_internal/utils/json_utils.py +54 -0
  62. dstack/api/_public/runs.py +13 -2
  63. dstack/api/server/_runs.py +12 -2
  64. dstack/version.py +1 -1
  65. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/METADATA +7 -5
  66. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/RECORD +69 -66
  67. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/WHEEL +0 -0
  68. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/entry_points.txt +0 -0
  69. {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/cli/services/configurators/fleet.py

@@ -25,6 +25,7 @@ from dstack._internal.core.errors import (
     ServerClientError,
     URLNotFoundError,
 )
+from dstack._internal.core.models.common import ApplyAction
 from dstack._internal.core.models.configurations import ApplyConfigurationType
 from dstack._internal.core.models.fleets import (
     Fleet,
@@ -72,7 +73,104 @@ class FleetConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
             spec=spec,
         )
         _print_plan_header(plan)
+        if plan.action is not None:
+            self._apply_plan(plan, command_args)
+        else:
+            # Old servers don't support spec update
+            self._apply_plan_on_old_server(plan, command_args)
+
+    def _apply_plan(self, plan: FleetPlan, command_args: argparse.Namespace):
+        delete_fleet_name: Optional[str] = None
+        action_message = ""
+        confirm_message = ""
+        if plan.current_resource is None:
+            if plan.spec.configuration.name is not None:
+                action_message += (
+                    f"Fleet [code]{plan.spec.configuration.name}[/] does not exist yet."
+                )
+            confirm_message += "Create the fleet?"
+        else:
+            action_message += f"Found fleet [code]{plan.spec.configuration.name}[/]."
+            if plan.action == ApplyAction.CREATE:
+                delete_fleet_name = plan.current_resource.name
+                action_message += (
+                    " Configuration changes detected. Cannot update the fleet in-place"
+                )
+                confirm_message += "Re-create the fleet?"
+            elif plan.current_resource.spec == plan.effective_spec:
+                if command_args.yes and not command_args.force:
+                    # --force is required only with --yes,
+                    # otherwise we may ask for force apply interactively.
+                    console.print(
+                        "No configuration changes detected. Use --force to apply anyway."
+                    )
+                    return
+                delete_fleet_name = plan.current_resource.name
+                action_message += " No configuration changes detected."
+                confirm_message += "Re-create the fleet?"
+            else:
+                action_message += " Configuration changes detected."
+                confirm_message += "Update the fleet in-place?"
+
+        console.print(action_message)
+        if not command_args.yes and not confirm_ask(confirm_message):
+            console.print("\nExiting...")
+            return
+
+        if delete_fleet_name is not None:
+            with console.status("Deleting existing fleet..."):
+                self.api.client.fleets.delete(
+                    project_name=self.api.project, names=[delete_fleet_name]
+                )
+                # Fleet deletion is async. Wait for fleet to be deleted.
+                while True:
+                    try:
+                        self.api.client.fleets.get(
+                            project_name=self.api.project, name=delete_fleet_name
+                        )
+                    except ResourceNotExistsError:
+                        break
+                    else:
+                        time.sleep(1)
+
+        try:
+            with console.status("Applying plan..."):
+                fleet = self.api.client.fleets.apply_plan(project_name=self.api.project, plan=plan)
+        except ServerClientError as e:
+            raise CLIError(e.msg)
+        if command_args.detach:
+            console.print("Fleet configuration submitted. Exiting...")
+            return
+        try:
+            with MultiItemStatus(
+                f"Provisioning [code]{fleet.name}[/]...", console=console
+            ) as live:
+                while not _finished_provisioning(fleet):
+                    table = get_fleets_table([fleet])
+                    live.update(table)
+                    time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS)
+                    fleet = self.api.client.fleets.get(self.api.project, fleet.name)
+        except KeyboardInterrupt:
+            if confirm_ask("Delete the fleet before exiting?"):
+                with console.status("Deleting fleet..."):
+                    self.api.client.fleets.delete(
+                        project_name=self.api.project, names=[fleet.name]
+                    )
+            else:
+                console.print("Exiting... Fleet provisioning will continue in the background.")
+            return
+        console.print(
+            get_fleets_table(
+                [fleet],
+                verbose=_failed_provisioning(fleet),
+                format_date=local_time,
+            )
+        )
+        if _failed_provisioning(fleet):
+            console.print("\n[error]Some instances failed. Check the table above for errors.[/]")
+            exit(1)
 
+    def _apply_plan_on_old_server(self, plan: FleetPlan, command_args: argparse.Namespace):
         action_message = ""
         confirm_message = ""
         if plan.current_resource is None:
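
The new `_apply_plan` path above is taken only when the server returns `plan.action`; otherwise the CLI falls back to `_apply_plan_on_old_server`. Roughly, the branching reduces to the following (a simplified, hypothetical model of the logic shown above, not the actual implementation):

from typing import Optional

# Simplified, hypothetical model of the decision made in _apply_plan / apply_configuration.
# "create" mirrors ApplyAction.CREATE referenced in this diff.
def choose_apply_strategy(plan_action: Optional[str], spec_changed: bool, force: bool) -> str:
    if plan_action is None:
        return "legacy apply"          # pre-0.19.19 server: no spec update support
    if plan_action == "create":
        return "delete, then create"   # changes cannot be applied in place
    if not spec_changed and not force:
        return "no-op"                 # nothing to do unless --force is given
    if not spec_changed and force:
        return "delete, then create"   # forced re-provisioning of an unchanged spec
    return "update in place"           # server applies the new spec to the existing fleet
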
@@ -86,7 +184,7 @@ class FleetConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
         diff = diff_models(
             old=plan.current_resource.spec.configuration,
             new=plan.spec.configuration,
-            ignore={
+            reset={
                 "ssh_config": {
                     "ssh_key": True,
                     "proxy_jump": {"ssh_key"},

dstack/_internal/cli/services/profile.py

@@ -159,7 +159,7 @@ def apply_profile_args(
     if args.idle_duration is not None:
         profile_settings.idle_duration = args.idle_duration
     elif args.dont_destroy:
-        profile_settings.idle_duration = False
+        profile_settings.idle_duration = "off"
     if args.creation_policy_reuse:
         profile_settings.creation_policy = CreationPolicy.REUSE
 

dstack/_internal/core/compatibility/runs.py

@@ -3,7 +3,16 @@ from typing import Optional
 from dstack._internal.core.models.common import IncludeExcludeDictType, IncludeExcludeSetType
 from dstack._internal.core.models.configurations import ServiceConfiguration
 from dstack._internal.core.models.runs import ApplyRunPlanInput, JobSpec, JobSubmission, RunSpec
-from dstack._internal.server.schemas.runs import GetRunPlanRequest
+from dstack._internal.server.schemas.runs import GetRunPlanRequest, ListRunsRequest
+
+
+def get_list_runs_excludes(list_runs_request: ListRunsRequest) -> IncludeExcludeSetType:
+    excludes = set()
+    if list_runs_request.include_jobs:
+        excludes.add("include_jobs")
+    if list_runs_request.job_submissions_limit is None:
+        excludes.add("job_submissions_limit")
+    return excludes
 
 
 def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeDictType]:
@@ -139,6 +148,8 @@ def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType:
         spec_excludes["repo_data"] = True
     if all(not s.file_archives for s in job_specs):
         spec_excludes["file_archives"] = True
+    if all(s.service_port is None for s in job_specs):
+        spec_excludes["service_port"] = True
 
     return spec_excludes
 

dstack/_internal/core/compatibility/volumes.py

@@ -30,4 +30,6 @@ def _get_volume_configuration_excludes(
     configuration_excludes: IncludeExcludeDictType = {}
     if configuration.tags is None:
         configuration_excludes["tags"] = True
+    if configuration.auto_cleanup_duration is None:
+        configuration_excludes["auto_cleanup_duration"] = True
     return configuration_excludes
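
Both compatibility helpers follow the same pattern: fields that are unset or left at a backward-compatible default are excluded from the serialized request, so older servers never receive keys they don't know about. A sketch of how such an exclude set is typically consumed (the request model below is a stand-in for illustration, not the real schema):

from typing import Optional
from pydantic import BaseModel

# Stand-in request model, for illustration only.
class ListRunsRequestSketch(BaseModel):
    project_name: Optional[str] = None
    include_jobs: bool = True
    job_submissions_limit: Optional[int] = None

request = ListRunsRequestSketch(project_name="main")
excludes = set()
if request.include_jobs:
    excludes.add("include_jobs")
if request.job_submissions_limit is None:
    excludes.add("job_submissions_limit")

# Dropping defaulted fields keeps the payload parseable by pre-0.19.19 servers.
body = request.json(exclude=excludes)
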

dstack/_internal/core/models/common.py

@@ -1,11 +1,14 @@
 import re
 from enum import Enum
-from typing import Union
+from typing import Any, Callable, Optional, Union
 
+import orjson
 from pydantic import Field
 from pydantic_duality import DualBaseModel
 from typing_extensions import Annotated
 
+from dstack._internal.utils.json_utils import pydantic_orjson_dumps
+
 IncludeExcludeFieldType = Union[int, str]
 IncludeExcludeSetType = set[IncludeExcludeFieldType]
 IncludeExcludeDictType = dict[
@@ -20,7 +23,40 @@ IncludeExcludeType = Union[IncludeExcludeSetType, IncludeExcludeDictType]
 # This allows to use the same model both for a strict parsing of the user input and
 # for a permissive parsing of the server responses.
 class CoreModel(DualBaseModel):
-    pass
+    class Config:
+        json_loads = orjson.loads
+        json_dumps = pydantic_orjson_dumps
+
+    def json(
+        self,
+        *,
+        include: Optional[IncludeExcludeType] = None,
+        exclude: Optional[IncludeExcludeType] = None,
+        by_alias: bool = False,
+        skip_defaults: Optional[bool] = None,  # ignore as it's deprecated
+        exclude_unset: bool = False,
+        exclude_defaults: bool = False,
+        exclude_none: bool = False,
+        encoder: Optional[Callable[[Any], Any]] = None,
+        models_as_dict: bool = True,  # does not seems to be needed by dstack or dependencies
+        **dumps_kwargs: Any,
+    ) -> str:
+        """
+        Override `json()` method so that it calls `dict()`.
+        Allows changing how models are serialized by overriding `dict()` only.
+        By default, `json()` won't call `dict()`, so changes applied in `dict()` won't take place.
+        """
+        data = self.dict(
+            by_alias=by_alias,
+            include=include,
+            exclude=exclude,
+            exclude_unset=exclude_unset,
+            exclude_defaults=exclude_defaults,
+            exclude_none=exclude_none,
+        )
+        if self.__custom_root_type__:
+            data = data["__root__"]
+        return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
 
 
 class Duration(int):
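
`CoreModel` now serializes through orjson and routes `json()` via `dict()`, so per-model `dict()` overrides (like the one added to `ResourcesSpec` below) also affect JSON output. The new `dstack/_internal/utils/json_utils.py` module itself is not shown in this diff; a plausible minimal shape for its helpers, assuming they only adapt orjson to pydantic v1's `Config.json_dumps` contract (orjson returns `bytes`, pydantic expects `str`), would be:

import orjson

# Assumed implementations; the real module may differ.
def pydantic_orjson_dumps(data, *, default=None, **kwargs) -> str:
    return orjson.dumps(data, default=default).decode()

def pydantic_orjson_dumps_with_indent(data, *, default=None, **kwargs) -> str:
    return orjson.dumps(data, default=default, option=orjson.OPT_INDENT_2).decode()
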

dstack/_internal/core/models/configurations.py

@@ -4,6 +4,7 @@ from enum import Enum
 from pathlib import PurePosixPath
 from typing import Any, Dict, List, Optional, Union
 
+import orjson
 from pydantic import Field, ValidationError, conint, constr, root_validator, validator
 from typing_extensions import Annotated, Literal
 
@@ -18,6 +19,9 @@ from dstack._internal.core.models.resources import Range, ResourcesSpec
 from dstack._internal.core.models.services import AnyModel, OpenAIChatModel
 from dstack._internal.core.models.unix import UnixUser
 from dstack._internal.core.models.volumes import MountPoint, VolumeConfiguration, parse_mount_point
+from dstack._internal.utils.json_utils import (
+    pydantic_orjson_dumps_with_indent,
+)
 
 CommandsList = List[str]
 ValidPort = conint(gt=0, le=65536)
@@ -394,8 +398,9 @@ class TaskConfiguration(
 
 class ServiceConfigurationParams(CoreModel):
     port: Annotated[
+        # NOTE: it's a PortMapping for historical reasons. Only `port.container_port` is used.
         Union[ValidPort, constr(regex=r"^[0-9]+:[0-9]+$"), PortMapping],
-        Field(description="The port, that application listens on or the mapping"),
+        Field(description="The port the application listens on"),
     ]
     gateway: Annotated[
         Optional[Union[bool, str]],
@@ -573,6 +578,9 @@ class DstackConfiguration(CoreModel):
     ]
 
     class Config:
+        json_loads = orjson.loads
+        json_dumps = pydantic_orjson_dumps_with_indent
+
         @staticmethod
         def schema_extra(schema: Dict[str, Any]):
             schema["$schema"] = "http://json-schema.org/draft-07/schema#"

dstack/_internal/core/models/fleets.py

@@ -8,7 +8,7 @@ from pydantic import Field, root_validator, validator
 from typing_extensions import Annotated, Literal
 
 from dstack._internal.core.models.backends.base import BackendType
-from dstack._internal.core.models.common import CoreModel
+from dstack._internal.core.models.common import ApplyAction, CoreModel
 from dstack._internal.core.models.envs import Env
 from dstack._internal.core.models.instances import Instance, InstanceOfferWithAvailability, SSHKey
 from dstack._internal.core.models.profiles import (
@@ -324,6 +324,7 @@ class FleetPlan(CoreModel):
     offers: List[InstanceOfferWithAvailability]
     total_offers: int
    max_offer_price: Optional[float] = None
+    action: Optional[ApplyAction] = None  # default value for backward compatibility
 
     def get_effective_spec(self) -> FleetSpec:
         if self.effective_spec is not None:

dstack/_internal/core/models/profiles.py

@@ -1,12 +1,14 @@
 from enum import Enum
 from typing import Any, Dict, List, Optional, Union, overload
 
+import orjson
 from pydantic import Field, root_validator, validator
 from typing_extensions import Annotated, Literal
 
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel, Duration
 from dstack._internal.utils.common import list_enum_values_for_annotation
+from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent
 from dstack._internal.utils.tags import tags_validator
 
 DEFAULT_RETRY_DURATION = 3600
@@ -74,11 +76,9 @@ def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str
     return parse_duration(v)
 
 
-def parse_idle_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int, bool]]:
-    if v is False:
+def parse_idle_duration(v: Optional[Union[int, str]]) -> Optional[Union[str, int]]:
+    if v == "off" or v == -1:
         return -1
-    if v is True:
-        return None
     return parse_duration(v)
 
 
@@ -249,7 +249,7 @@ class ProfileParams(CoreModel):
         ),
     ] = None
     idle_duration: Annotated[
-        Optional[Union[Literal["off"], str, int, bool]],
+        Optional[Union[Literal["off"], str, int]],
        Field(
             description=(
                 "Time to wait before terminating idle instances."
@@ -343,6 +343,9 @@ class ProfilesConfig(CoreModel):
     profiles: List[Profile]
 
     class Config:
+        json_loads = orjson.loads
+        json_dumps = pydantic_orjson_dumps_with_indent
+
         schema_extra = {"$schema": "http://json-schema.org/draft-07/schema#"}
 
     def default(self) -> Optional[Profile]:
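
`parse_idle_duration` no longer accepts booleans: the CLI change above replaces `False` with the string `"off"`, and the `True` branch is dropped. An illustration of the new contract (assuming `parse_duration` converts strings such as `"1h"` to seconds and passes ints/None through, as elsewhere in dstack):

from dstack._internal.core.models.profiles import parse_idle_duration

parse_idle_duration("off")   # -> -1, idle termination disabled
parse_idle_duration(-1)      # -> -1, numeric spelling of "off"
parse_idle_duration("1h")    # -> 3600, assuming parse_duration("1h") == 3600
parse_idle_duration(1800)    # -> 1800, plain seconds
parse_idle_duration(None)    # -> None, keep the default behavior
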

dstack/_internal/core/models/resources.py

@@ -382,14 +382,6 @@ class ResourcesSpec(CoreModel):
     gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None
     disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK
 
-    # TODO: Remove in 0.20. Added for backward compatibility.
-    @root_validator
-    def _post_validate(cls, values):
-        cpu = values.get("cpu")
-        if isinstance(cpu, CPUSpec) and cpu.arch in [None, gpuhunt.CPUArchitecture.X86]:
-            values["cpu"] = cpu.count
-        return values
-
     def pretty_format(self) -> str:
         # TODO: Remove in 0.20. Use self.cpu directly
         cpu = parse_obj_as(CPUSpec, self.cpu)
@@ -407,3 +399,18 @@
             resources.update(disk_size=self.disk.size)
         res = pretty_resources(**resources)
         return res
+
+    def dict(self, *args, **kwargs) -> Dict:
+        # super() does not work with pydantic-duality
+        res = CoreModel.dict(self, *args, **kwargs)
+        self._update_serialized_cpu(res)
+        return res
+
+    # TODO: Remove in 0.20. Added for backward compatibility.
+    def _update_serialized_cpu(self, values: Dict):
+        cpu = values["cpu"]
+        if cpu:
+            arch = cpu.get("arch")
+            count = cpu.get("count")
+            if count and arch in [None, gpuhunt.CPUArchitecture.X86.value]:
+                values["cpu"] = count
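
The x86 backward-compatibility shim moves from a `root_validator`, which rewrote the parsed model itself, to the `dict()` serialization path: the in-memory `ResourcesSpec` keeps the full `CPUSpec`, and only serialized output collapses an x86 CPU back to a bare count for older clients. A minimal standalone illustration of what `_update_serialized_cpu` does (values are illustrative; the real `count` is a range and the arch string comes from `gpuhunt.CPUArchitecture.X86.value`):

values = {"cpu": {"count": 8, "arch": "x86"}}
cpu = values["cpu"]
if cpu and cpu.get("count") and cpu.get("arch") in (None, "x86"):
    values["cpu"] = cpu["count"]
assert values == {"cpu": 8}  # the pre-CPUSpec shape expected by older clients
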

dstack/_internal/core/models/runs.py

@@ -11,6 +11,7 @@ from dstack._internal.core.models.configurations import (
     DEFAULT_REPO_DIR,
     AnyRunConfiguration,
     RunConfiguration,
+    ServiceConfiguration,
 )
 from dstack._internal.core.models.files import FileArchiveMapping
 from dstack._internal.core.models.instances import (
@@ -101,6 +102,14 @@ class RunTerminationReason(str, Enum):
         }
         return mapping[self]
 
+    def to_error(self) -> Optional[str]:
+        if self == RunTerminationReason.RETRY_LIMIT_EXCEEDED:
+            return "retry limit exceeded"
+        elif self == RunTerminationReason.SERVER_ERROR:
+            return "server error"
+        else:
+            return None
+
 
 class JobTerminationReason(str, Enum):
     # Set by the server
@@ -162,6 +171,24 @@ class JobTerminationReason(str, Enum):
         default = RetryEvent.ERROR if self.to_status() == JobStatus.FAILED else None
         return mapping.get(self, default)
 
+    def to_error(self) -> Optional[str]:
+        # Should return None for values that are already
+        # handled and shown in status_message.
+        error_mapping = {
+            JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
+            JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
+            JobTerminationReason.VOLUME_ERROR: "volume error",
+            JobTerminationReason.GATEWAY_ERROR: "gateway error",
+            JobTerminationReason.SCALED_DOWN: "scaled down",
+            JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
+            JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
+            JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed",
+            JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error",
+            JobTerminationReason.EXECUTOR_ERROR: "executor error",
+            JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded",
+        }
+        return error_mapping.get(self)
+
 
 class Requirements(CoreModel):
     # TODO: Make requirements' fields required
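
The per-reason error strings that previously lived in response-side validators (removed further down) now sit on the enums themselves, so the server can compute `error` once and ship it in the response. For example:

from dstack._internal.core.models.runs import JobTerminationReason, RunTerminationReason

JobTerminationReason.GATEWAY_ERROR.to_error()          # -> "gateway error"
RunTerminationReason.RETRY_LIMIT_EXCEEDED.to_error()   # -> "retry limit exceeded"
JobTerminationReason.TERMINATED_BY_USER.to_error()     # -> None (already covered by status_message)
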
@@ -227,6 +254,8 @@ class JobSpec(CoreModel):
     # TODO: drop this comment when supporting jobs submitted before 0.19.17 is no longer relevant.
     repo_code_hash: Optional[str] = None
     file_archives: list[FileArchiveMapping] = []
+    # None for non-services and pre-0.19.19 services. See `get_service_port`
+    service_port: Optional[int] = None
 
 
 class JobProvisioningData(CoreModel):
@@ -305,13 +334,12 @@ class JobSubmission(CoreModel):
     finished_at: Optional[datetime]
     inactivity_secs: Optional[int]
     status: JobStatus
+    status_message: str = ""  # default for backward compatibility
     termination_reason: Optional[JobTerminationReason]
     termination_reason_message: Optional[str]
     exit_status: Optional[int]
     job_provisioning_data: Optional[JobProvisioningData]
     job_runtime_data: Optional[JobRuntimeData]
-    # TODO: make status_message and error a computed field after migrating to pydanticV2
-    status_message: Optional[str] = None
     error: Optional[str] = None
 
     @property
@@ -325,71 +353,6 @@ class JobSubmission(CoreModel):
         end_time = self.finished_at
         return end_time - self.submitted_at
 
-    @root_validator
-    def _status_message(cls, values) -> Dict:
-        try:
-            status = values["status"]
-            termination_reason = values["termination_reason"]
-            exit_code = values["exit_status"]
-        except KeyError:
-            return values
-        values["status_message"] = JobSubmission._get_status_message(
-            status=status,
-            termination_reason=termination_reason,
-            exit_status=exit_code,
-        )
-        return values
-
-    @staticmethod
-    def _get_status_message(
-        status: JobStatus,
-        termination_reason: Optional[JobTerminationReason],
-        exit_status: Optional[int],
-    ) -> str:
-        if status == JobStatus.DONE:
-            return "exited (0)"
-        elif status == JobStatus.FAILED:
-            if termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
-                return f"exited ({exit_status})"
-            elif termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
-                return "no offers"
-            elif termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
-                return "interrupted"
-            else:
-                return "error"
-        elif status == JobStatus.TERMINATED:
-            if termination_reason == JobTerminationReason.TERMINATED_BY_USER:
-                return "stopped"
-            elif termination_reason == JobTerminationReason.ABORTED_BY_USER:
-                return "aborted"
-        return status.value
-
-    @root_validator
-    def _error(cls, values) -> Dict:
-        try:
-            termination_reason = values["termination_reason"]
-        except KeyError:
-            return values
-        values["error"] = JobSubmission._get_error(termination_reason=termination_reason)
-        return values
-
-    @staticmethod
-    def _get_error(termination_reason: Optional[JobTerminationReason]) -> Optional[str]:
-        error_mapping = {
-            JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
-            JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
-            JobTerminationReason.VOLUME_ERROR: "volume error",
-            JobTerminationReason.GATEWAY_ERROR: "gateway error",
-            JobTerminationReason.SCALED_DOWN: "scaled down",
-            JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
-            JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
-            JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed",
-            JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error",
-            JobTerminationReason.EXECUTOR_ERROR: "executor error",
-            JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded",
-        }
-        return error_mapping.get(termination_reason)
-
 
 class Job(CoreModel):
     job_spec: JobSpec
@@ -524,85 +487,17 @@ class Run(CoreModel):
     submitted_at: datetime
     last_processed_at: datetime
     status: RunStatus
-    status_message: Optional[str] = None
-    termination_reason: Optional[RunTerminationReason]
+    status_message: str = ""  # default for backward compatibility
+    termination_reason: Optional[RunTerminationReason] = None
     run_spec: RunSpec
     jobs: List[Job]
-    latest_job_submission: Optional[JobSubmission]
+    latest_job_submission: Optional[JobSubmission] = None
     cost: float = 0
     service: Optional[ServiceSpec] = None
     deployment_num: int = 0  # default for compatibility with pre-0.19.14 servers
-    # TODO: make error a computed field after migrating to pydanticV2
     error: Optional[str] = None
     deleted: Optional[bool] = None
 
-    @root_validator
-    def _error(cls, values) -> Dict:
-        try:
-            termination_reason = values["termination_reason"]
-        except KeyError:
-            return values
-        values["error"] = Run._get_error(termination_reason=termination_reason)
-        return values
-
-    @staticmethod
-    def _get_error(termination_reason: Optional[RunTerminationReason]) -> Optional[str]:
-        if termination_reason == RunTerminationReason.RETRY_LIMIT_EXCEEDED:
-            return "retry limit exceeded"
-        elif termination_reason == RunTerminationReason.SERVER_ERROR:
-            return "server error"
-        else:
-            return None
-
-    @root_validator
-    def _status_message(cls, values) -> Dict:
-        try:
-            status = values["status"]
-            jobs: List[Job] = values["jobs"]
-            retry_on_events = (
-                jobs[0].job_spec.retry.on_events if jobs and jobs[0].job_spec.retry else []
-            )
-            job_status = (
-                jobs[0].job_submissions[-1].status
-                if len(jobs) == 1 and jobs[0].job_submissions
-                else None
-            )
-            termination_reason = Run.get_last_termination_reason(jobs[0]) if jobs else None
-        except KeyError:
-            return values
-        values["status_message"] = Run._get_status_message(
-            status=status,
-            job_status=job_status,
-            retry_on_events=retry_on_events,
-            termination_reason=termination_reason,
-        )
-        return values
-
-    @staticmethod
-    def get_last_termination_reason(job: "Job") -> Optional[JobTerminationReason]:
-        for submission in reversed(job.job_submissions):
-            if submission.termination_reason is not None:
-                return submission.termination_reason
-        return None
-
-    @staticmethod
-    def _get_status_message(
-        status: RunStatus,
-        job_status: Optional[JobStatus],
-        retry_on_events: List[RetryEvent],
-        termination_reason: Optional[JobTerminationReason],
-    ) -> str:
-        if job_status == JobStatus.PULLING:
-            return "pulling"
-        # Currently, `retrying` is shown only for `no-capacity` events
-        if (
-            status in [RunStatus.SUBMITTED, RunStatus.PENDING]
-            and termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
-            and RetryEvent.NO_CAPACITY in retry_on_events
-        ):
-            return "retrying"
-        return status.value
-
     def is_deployment_in_progress(self) -> bool:
         return any(
             not j.job_submissions[-1].status.is_finished()
@@ -658,3 +553,11 @@ def get_policy_map(spot_policy: Optional[SpotPolicy], default: SpotPolicy) -> Op
         SpotPolicy.ONDEMAND: False,
     }
     return policy_map[spot_policy]
+
+
+def get_service_port(job_spec: JobSpec, configuration: ServiceConfiguration) -> int:
+    # Compatibility with pre-0.19.19 job specs that do not have the `service_port` property.
+    # TODO: drop when pre-0.19.19 jobs are no longer relevant.
+    if job_spec.service_port is None:
+        return configuration.port.container_port
+    return job_spec.service_port
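
`get_service_port` gives callers a single place to resolve the port for both old and new job specs. A standalone mirror of the fallback, for illustration only:

from typing import Optional

def resolve_service_port(service_port: Optional[int], configured_container_port: int) -> int:
    # Pre-0.19.19 job specs carry no service_port, so the service configuration's
    # container port is used; newer specs store the resolved port explicitly.
    return configured_container_port if service_port is None else service_port

assert resolve_service_port(None, 8000) == 8000   # legacy job spec
assert resolve_service_port(8080, 8000) == 8080   # 0.19.19+ job spec
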

dstack/_internal/core/models/volumes.py

@@ -9,6 +9,7 @@ from typing_extensions import Annotated, Self
 
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel
+from dstack._internal.core.models.profiles import parse_idle_duration
 from dstack._internal.core.models.resources import Memory
 from dstack._internal.utils.common import get_or_error
 from dstack._internal.utils.tags import tags_validator
@@ -44,6 +45,16 @@ class VolumeConfiguration(CoreModel):
         Optional[str],
         Field(description="The volume ID. Must be specified when registering external volumes"),
     ] = None
+    auto_cleanup_duration: Annotated[
+        Optional[Union[str, int]],
+        Field(
+            description=(
+                "Time to wait after volume is no longer used by any job before deleting it. "
+                "Defaults to keep the volume indefinitely. "
+                "Use the value 'off' or -1 to disable auto-cleanup."
+            )
+        ),
+    ] = None
     tags: Annotated[
         Optional[Dict[str, str]],
         Field(
@@ -56,6 +67,9 @@ class VolumeConfiguration(CoreModel):
     ] = None
 
     _validate_tags = validator("tags", pre=True, allow_reuse=True)(tags_validator)
+    _validate_auto_cleanup_duration = validator(
+        "auto_cleanup_duration", pre=True, allow_reuse=True
+    )(parse_idle_duration)
 
     @property
     def size_gb(self) -> int:
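
`auto_cleanup_duration` pairs with the new `process_idle_volumes` background task and the `VolumeModel.last_job_processed_at` column listed in the files above. The task's implementation is not shown in this diff; the idle check it presumably performs looks roughly like this (names simplified, an assumption rather than the actual code):

from datetime import datetime, timedelta
from typing import Optional

def should_delete_idle_volume(
    last_job_processed_at: datetime,
    auto_cleanup_duration_secs: Optional[int],
    now: datetime,
) -> bool:
    if not auto_cleanup_duration_secs or auto_cleanup_duration_secs < 0:
        # None (keep indefinitely) or -1 ("off"): never auto-delete.
        return False
    return now - last_job_processed_at >= timedelta(seconds=auto_cleanup_duration_secs)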