dstack 0.19.18__py3-none-any.whl → 0.19.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/services/configurators/fleet.py +99 -1
- dstack/_internal/cli/services/profile.py +1 -1
- dstack/_internal/core/compatibility/runs.py +12 -1
- dstack/_internal/core/compatibility/volumes.py +2 -0
- dstack/_internal/core/models/common.py +38 -2
- dstack/_internal/core/models/configurations.py +9 -1
- dstack/_internal/core/models/fleets.py +2 -1
- dstack/_internal/core/models/profiles.py +8 -5
- dstack/_internal/core/models/resources.py +15 -8
- dstack/_internal/core/models/runs.py +41 -138
- dstack/_internal/core/models/volumes.py +14 -0
- dstack/_internal/core/services/diff.py +30 -10
- dstack/_internal/core/services/ssh/attach.py +2 -0
- dstack/_internal/server/app.py +17 -9
- dstack/_internal/server/background/__init__.py +5 -3
- dstack/_internal/server/background/tasks/process_gateways.py +46 -28
- dstack/_internal/server/background/tasks/process_idle_volumes.py +139 -0
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +2 -0
- dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +6 -6
- dstack/_internal/server/migrations/versions/d5863798bf41_add_volumemodel_last_job_processed_at.py +40 -0
- dstack/_internal/server/models.py +1 -0
- dstack/_internal/server/routers/backends.py +23 -16
- dstack/_internal/server/routers/files.py +7 -6
- dstack/_internal/server/routers/fleets.py +47 -36
- dstack/_internal/server/routers/gateways.py +27 -18
- dstack/_internal/server/routers/instances.py +18 -13
- dstack/_internal/server/routers/logs.py +7 -3
- dstack/_internal/server/routers/metrics.py +14 -8
- dstack/_internal/server/routers/projects.py +33 -22
- dstack/_internal/server/routers/repos.py +7 -6
- dstack/_internal/server/routers/runs.py +49 -28
- dstack/_internal/server/routers/secrets.py +20 -15
- dstack/_internal/server/routers/server.py +7 -4
- dstack/_internal/server/routers/users.py +22 -19
- dstack/_internal/server/routers/volumes.py +34 -25
- dstack/_internal/server/schemas/logs.py +2 -2
- dstack/_internal/server/schemas/runs.py +17 -5
- dstack/_internal/server/services/fleets.py +354 -72
- dstack/_internal/server/services/gateways/__init__.py +13 -4
- dstack/_internal/server/services/gateways/client.py +5 -3
- dstack/_internal/server/services/instances.py +8 -0
- dstack/_internal/server/services/jobs/__init__.py +45 -0
- dstack/_internal/server/services/jobs/configurators/base.py +7 -0
- dstack/_internal/server/services/locking.py +3 -1
- dstack/_internal/server/services/logging.py +4 -2
- dstack/_internal/server/services/logs/__init__.py +15 -2
- dstack/_internal/server/services/logs/aws.py +2 -4
- dstack/_internal/server/services/logs/filelog.py +33 -27
- dstack/_internal/server/services/logs/gcp.py +3 -5
- dstack/_internal/server/services/proxy/repo.py +4 -1
- dstack/_internal/server/services/runs.py +115 -32
- dstack/_internal/server/services/services/__init__.py +2 -1
- dstack/_internal/server/services/users.py +3 -1
- dstack/_internal/server/services/volumes.py +13 -0
- dstack/_internal/server/settings.py +7 -2
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js → main-64f8273740c4b52c18f5.js} +6 -6
- dstack/_internal/server/statics/{main-d1ac2e8c38ed5f08a114.js.map → main-64f8273740c4b52c18f5.js.map} +1 -1
- dstack/_internal/server/testing/common.py +41 -5
- dstack/_internal/server/utils/routers.py +31 -8
- dstack/_internal/utils/json_utils.py +54 -0
- dstack/api/_public/runs.py +13 -2
- dstack/api/server/_runs.py +12 -2
- dstack/version.py +1 -1
- {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/METADATA +7 -5
- {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/RECORD +69 -66
- {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/WHEEL +0 -0
- {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.18.dist-info → dstack-0.19.19.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -25,6 +25,7 @@ from dstack._internal.core.errors import (
|
|
|
25
25
|
ServerClientError,
|
|
26
26
|
URLNotFoundError,
|
|
27
27
|
)
|
|
28
|
+
from dstack._internal.core.models.common import ApplyAction
|
|
28
29
|
from dstack._internal.core.models.configurations import ApplyConfigurationType
|
|
29
30
|
from dstack._internal.core.models.fleets import (
|
|
30
31
|
Fleet,
|
|
@@ -72,7 +73,104 @@ class FleetConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
|
|
|
72
73
|
spec=spec,
|
|
73
74
|
)
|
|
74
75
|
_print_plan_header(plan)
|
|
76
|
+
if plan.action is not None:
|
|
77
|
+
self._apply_plan(plan, command_args)
|
|
78
|
+
else:
|
|
79
|
+
# Old servers don't support spec update
|
|
80
|
+
self._apply_plan_on_old_server(plan, command_args)
|
|
81
|
+
|
|
82
|
+
def _apply_plan(self, plan: FleetPlan, command_args: argparse.Namespace):
|
|
83
|
+
delete_fleet_name: Optional[str] = None
|
|
84
|
+
action_message = ""
|
|
85
|
+
confirm_message = ""
|
|
86
|
+
if plan.current_resource is None:
|
|
87
|
+
if plan.spec.configuration.name is not None:
|
|
88
|
+
action_message += (
|
|
89
|
+
f"Fleet [code]{plan.spec.configuration.name}[/] does not exist yet."
|
|
90
|
+
)
|
|
91
|
+
confirm_message += "Create the fleet?"
|
|
92
|
+
else:
|
|
93
|
+
action_message += f"Found fleet [code]{plan.spec.configuration.name}[/]."
|
|
94
|
+
if plan.action == ApplyAction.CREATE:
|
|
95
|
+
delete_fleet_name = plan.current_resource.name
|
|
96
|
+
action_message += (
|
|
97
|
+
" Configuration changes detected. Cannot update the fleet in-place"
|
|
98
|
+
)
|
|
99
|
+
confirm_message += "Re-create the fleet?"
|
|
100
|
+
elif plan.current_resource.spec == plan.effective_spec:
|
|
101
|
+
if command_args.yes and not command_args.force:
|
|
102
|
+
# --force is required only with --yes,
|
|
103
|
+
# otherwise we may ask for force apply interactively.
|
|
104
|
+
console.print(
|
|
105
|
+
"No configuration changes detected. Use --force to apply anyway."
|
|
106
|
+
)
|
|
107
|
+
return
|
|
108
|
+
delete_fleet_name = plan.current_resource.name
|
|
109
|
+
action_message += " No configuration changes detected."
|
|
110
|
+
confirm_message += "Re-create the fleet?"
|
|
111
|
+
else:
|
|
112
|
+
action_message += " Configuration changes detected."
|
|
113
|
+
confirm_message += "Update the fleet in-place?"
|
|
114
|
+
|
|
115
|
+
console.print(action_message)
|
|
116
|
+
if not command_args.yes and not confirm_ask(confirm_message):
|
|
117
|
+
console.print("\nExiting...")
|
|
118
|
+
return
|
|
119
|
+
|
|
120
|
+
if delete_fleet_name is not None:
|
|
121
|
+
with console.status("Deleting existing fleet..."):
|
|
122
|
+
self.api.client.fleets.delete(
|
|
123
|
+
project_name=self.api.project, names=[delete_fleet_name]
|
|
124
|
+
)
|
|
125
|
+
# Fleet deletion is async. Wait for fleet to be deleted.
|
|
126
|
+
while True:
|
|
127
|
+
try:
|
|
128
|
+
self.api.client.fleets.get(
|
|
129
|
+
project_name=self.api.project, name=delete_fleet_name
|
|
130
|
+
)
|
|
131
|
+
except ResourceNotExistsError:
|
|
132
|
+
break
|
|
133
|
+
else:
|
|
134
|
+
time.sleep(1)
|
|
135
|
+
|
|
136
|
+
try:
|
|
137
|
+
with console.status("Applying plan..."):
|
|
138
|
+
fleet = self.api.client.fleets.apply_plan(project_name=self.api.project, plan=plan)
|
|
139
|
+
except ServerClientError as e:
|
|
140
|
+
raise CLIError(e.msg)
|
|
141
|
+
if command_args.detach:
|
|
142
|
+
console.print("Fleet configuration submitted. Exiting...")
|
|
143
|
+
return
|
|
144
|
+
try:
|
|
145
|
+
with MultiItemStatus(
|
|
146
|
+
f"Provisioning [code]{fleet.name}[/]...", console=console
|
|
147
|
+
) as live:
|
|
148
|
+
while not _finished_provisioning(fleet):
|
|
149
|
+
table = get_fleets_table([fleet])
|
|
150
|
+
live.update(table)
|
|
151
|
+
time.sleep(LIVE_TABLE_PROVISION_INTERVAL_SECS)
|
|
152
|
+
fleet = self.api.client.fleets.get(self.api.project, fleet.name)
|
|
153
|
+
except KeyboardInterrupt:
|
|
154
|
+
if confirm_ask("Delete the fleet before exiting?"):
|
|
155
|
+
with console.status("Deleting fleet..."):
|
|
156
|
+
self.api.client.fleets.delete(
|
|
157
|
+
project_name=self.api.project, names=[fleet.name]
|
|
158
|
+
)
|
|
159
|
+
else:
|
|
160
|
+
console.print("Exiting... Fleet provisioning will continue in the background.")
|
|
161
|
+
return
|
|
162
|
+
console.print(
|
|
163
|
+
get_fleets_table(
|
|
164
|
+
[fleet],
|
|
165
|
+
verbose=_failed_provisioning(fleet),
|
|
166
|
+
format_date=local_time,
|
|
167
|
+
)
|
|
168
|
+
)
|
|
169
|
+
if _failed_provisioning(fleet):
|
|
170
|
+
console.print("\n[error]Some instances failed. Check the table above for errors.[/]")
|
|
171
|
+
exit(1)
|
|
75
172
|
|
|
173
|
+
def _apply_plan_on_old_server(self, plan: FleetPlan, command_args: argparse.Namespace):
|
|
76
174
|
action_message = ""
|
|
77
175
|
confirm_message = ""
|
|
78
176
|
if plan.current_resource is None:
|
|
@@ -86,7 +184,7 @@ class FleetConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
|
|
|
86
184
|
diff = diff_models(
|
|
87
185
|
old=plan.current_resource.spec.configuration,
|
|
88
186
|
new=plan.spec.configuration,
|
|
89
|
-
|
|
187
|
+
reset={
|
|
90
188
|
"ssh_config": {
|
|
91
189
|
"ssh_key": True,
|
|
92
190
|
"proxy_jump": {"ssh_key"},
|
|
@@ -159,7 +159,7 @@ def apply_profile_args(
|
|
|
159
159
|
if args.idle_duration is not None:
|
|
160
160
|
profile_settings.idle_duration = args.idle_duration
|
|
161
161
|
elif args.dont_destroy:
|
|
162
|
-
profile_settings.idle_duration =
|
|
162
|
+
profile_settings.idle_duration = "off"
|
|
163
163
|
if args.creation_policy_reuse:
|
|
164
164
|
profile_settings.creation_policy = CreationPolicy.REUSE
|
|
165
165
|
|
|
@@ -3,7 +3,16 @@ from typing import Optional
|
|
|
3
3
|
from dstack._internal.core.models.common import IncludeExcludeDictType, IncludeExcludeSetType
|
|
4
4
|
from dstack._internal.core.models.configurations import ServiceConfiguration
|
|
5
5
|
from dstack._internal.core.models.runs import ApplyRunPlanInput, JobSpec, JobSubmission, RunSpec
|
|
6
|
-
from dstack._internal.server.schemas.runs import GetRunPlanRequest
|
|
6
|
+
from dstack._internal.server.schemas.runs import GetRunPlanRequest, ListRunsRequest
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_list_runs_excludes(list_runs_request: ListRunsRequest) -> IncludeExcludeSetType:
|
|
10
|
+
excludes = set()
|
|
11
|
+
if list_runs_request.include_jobs:
|
|
12
|
+
excludes.add("include_jobs")
|
|
13
|
+
if list_runs_request.job_submissions_limit is None:
|
|
14
|
+
excludes.add("job_submissions_limit")
|
|
15
|
+
return excludes
|
|
7
16
|
|
|
8
17
|
|
|
9
18
|
def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeDictType]:
|
|
@@ -139,6 +148,8 @@ def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType:
|
|
|
139
148
|
spec_excludes["repo_data"] = True
|
|
140
149
|
if all(not s.file_archives for s in job_specs):
|
|
141
150
|
spec_excludes["file_archives"] = True
|
|
151
|
+
if all(s.service_port is None for s in job_specs):
|
|
152
|
+
spec_excludes["service_port"] = True
|
|
142
153
|
|
|
143
154
|
return spec_excludes
|
|
144
155
|
|
|
@@ -30,4 +30,6 @@ def _get_volume_configuration_excludes(
|
|
|
30
30
|
configuration_excludes: IncludeExcludeDictType = {}
|
|
31
31
|
if configuration.tags is None:
|
|
32
32
|
configuration_excludes["tags"] = True
|
|
33
|
+
if configuration.auto_cleanup_duration is None:
|
|
34
|
+
configuration_excludes["auto_cleanup_duration"] = True
|
|
33
35
|
return configuration_excludes
|
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
import re
|
|
2
2
|
from enum import Enum
|
|
3
|
-
from typing import Union
|
|
3
|
+
from typing import Any, Callable, Optional, Union
|
|
4
4
|
|
|
5
|
+
import orjson
|
|
5
6
|
from pydantic import Field
|
|
6
7
|
from pydantic_duality import DualBaseModel
|
|
7
8
|
from typing_extensions import Annotated
|
|
8
9
|
|
|
10
|
+
from dstack._internal.utils.json_utils import pydantic_orjson_dumps
|
|
11
|
+
|
|
9
12
|
IncludeExcludeFieldType = Union[int, str]
|
|
10
13
|
IncludeExcludeSetType = set[IncludeExcludeFieldType]
|
|
11
14
|
IncludeExcludeDictType = dict[
|
|
@@ -20,7 +23,40 @@ IncludeExcludeType = Union[IncludeExcludeSetType, IncludeExcludeDictType]
|
|
|
20
23
|
# This allows to use the same model both for a strict parsing of the user input and
|
|
21
24
|
# for a permissive parsing of the server responses.
|
|
22
25
|
class CoreModel(DualBaseModel):
|
|
23
|
-
|
|
26
|
+
class Config:
|
|
27
|
+
json_loads = orjson.loads
|
|
28
|
+
json_dumps = pydantic_orjson_dumps
|
|
29
|
+
|
|
30
|
+
def json(
|
|
31
|
+
self,
|
|
32
|
+
*,
|
|
33
|
+
include: Optional[IncludeExcludeType] = None,
|
|
34
|
+
exclude: Optional[IncludeExcludeType] = None,
|
|
35
|
+
by_alias: bool = False,
|
|
36
|
+
skip_defaults: Optional[bool] = None, # ignore as it's deprecated
|
|
37
|
+
exclude_unset: bool = False,
|
|
38
|
+
exclude_defaults: bool = False,
|
|
39
|
+
exclude_none: bool = False,
|
|
40
|
+
encoder: Optional[Callable[[Any], Any]] = None,
|
|
41
|
+
models_as_dict: bool = True, # does not seems to be needed by dstack or dependencies
|
|
42
|
+
**dumps_kwargs: Any,
|
|
43
|
+
) -> str:
|
|
44
|
+
"""
|
|
45
|
+
Override `json()` method so that it calls `dict()`.
|
|
46
|
+
Allows changing how models are serialized by overriding `dict()` only.
|
|
47
|
+
By default, `json()` won't call `dict()`, so changes applied in `dict()` won't take place.
|
|
48
|
+
"""
|
|
49
|
+
data = self.dict(
|
|
50
|
+
by_alias=by_alias,
|
|
51
|
+
include=include,
|
|
52
|
+
exclude=exclude,
|
|
53
|
+
exclude_unset=exclude_unset,
|
|
54
|
+
exclude_defaults=exclude_defaults,
|
|
55
|
+
exclude_none=exclude_none,
|
|
56
|
+
)
|
|
57
|
+
if self.__custom_root_type__:
|
|
58
|
+
data = data["__root__"]
|
|
59
|
+
return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
|
|
24
60
|
|
|
25
61
|
|
|
26
62
|
class Duration(int):
|
|
@@ -4,6 +4,7 @@ from enum import Enum
|
|
|
4
4
|
from pathlib import PurePosixPath
|
|
5
5
|
from typing import Any, Dict, List, Optional, Union
|
|
6
6
|
|
|
7
|
+
import orjson
|
|
7
8
|
from pydantic import Field, ValidationError, conint, constr, root_validator, validator
|
|
8
9
|
from typing_extensions import Annotated, Literal
|
|
9
10
|
|
|
@@ -18,6 +19,9 @@ from dstack._internal.core.models.resources import Range, ResourcesSpec
|
|
|
18
19
|
from dstack._internal.core.models.services import AnyModel, OpenAIChatModel
|
|
19
20
|
from dstack._internal.core.models.unix import UnixUser
|
|
20
21
|
from dstack._internal.core.models.volumes import MountPoint, VolumeConfiguration, parse_mount_point
|
|
22
|
+
from dstack._internal.utils.json_utils import (
|
|
23
|
+
pydantic_orjson_dumps_with_indent,
|
|
24
|
+
)
|
|
21
25
|
|
|
22
26
|
CommandsList = List[str]
|
|
23
27
|
ValidPort = conint(gt=0, le=65536)
|
|
@@ -394,8 +398,9 @@ class TaskConfiguration(
|
|
|
394
398
|
|
|
395
399
|
class ServiceConfigurationParams(CoreModel):
|
|
396
400
|
port: Annotated[
|
|
401
|
+
# NOTE: it's a PortMapping for historical reasons. Only `port.container_port` is used.
|
|
397
402
|
Union[ValidPort, constr(regex=r"^[0-9]+:[0-9]+$"), PortMapping],
|
|
398
|
-
Field(description="The port
|
|
403
|
+
Field(description="The port the application listens on"),
|
|
399
404
|
]
|
|
400
405
|
gateway: Annotated[
|
|
401
406
|
Optional[Union[bool, str]],
|
|
@@ -573,6 +578,9 @@ class DstackConfiguration(CoreModel):
|
|
|
573
578
|
]
|
|
574
579
|
|
|
575
580
|
class Config:
|
|
581
|
+
json_loads = orjson.loads
|
|
582
|
+
json_dumps = pydantic_orjson_dumps_with_indent
|
|
583
|
+
|
|
576
584
|
@staticmethod
|
|
577
585
|
def schema_extra(schema: Dict[str, Any]):
|
|
578
586
|
schema["$schema"] = "http://json-schema.org/draft-07/schema#"
|
|
@@ -8,7 +8,7 @@ from pydantic import Field, root_validator, validator
|
|
|
8
8
|
from typing_extensions import Annotated, Literal
|
|
9
9
|
|
|
10
10
|
from dstack._internal.core.models.backends.base import BackendType
|
|
11
|
-
from dstack._internal.core.models.common import CoreModel
|
|
11
|
+
from dstack._internal.core.models.common import ApplyAction, CoreModel
|
|
12
12
|
from dstack._internal.core.models.envs import Env
|
|
13
13
|
from dstack._internal.core.models.instances import Instance, InstanceOfferWithAvailability, SSHKey
|
|
14
14
|
from dstack._internal.core.models.profiles import (
|
|
@@ -324,6 +324,7 @@ class FleetPlan(CoreModel):
|
|
|
324
324
|
offers: List[InstanceOfferWithAvailability]
|
|
325
325
|
total_offers: int
|
|
326
326
|
max_offer_price: Optional[float] = None
|
|
327
|
+
action: Optional[ApplyAction] = None # default value for backward compatibility
|
|
327
328
|
|
|
328
329
|
def get_effective_spec(self) -> FleetSpec:
|
|
329
330
|
if self.effective_spec is not None:
|
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
from enum import Enum
|
|
2
2
|
from typing import Any, Dict, List, Optional, Union, overload
|
|
3
3
|
|
|
4
|
+
import orjson
|
|
4
5
|
from pydantic import Field, root_validator, validator
|
|
5
6
|
from typing_extensions import Annotated, Literal
|
|
6
7
|
|
|
7
8
|
from dstack._internal.core.models.backends.base import BackendType
|
|
8
9
|
from dstack._internal.core.models.common import CoreModel, Duration
|
|
9
10
|
from dstack._internal.utils.common import list_enum_values_for_annotation
|
|
11
|
+
from dstack._internal.utils.json_utils import pydantic_orjson_dumps_with_indent
|
|
10
12
|
from dstack._internal.utils.tags import tags_validator
|
|
11
13
|
|
|
12
14
|
DEFAULT_RETRY_DURATION = 3600
|
|
@@ -74,11 +76,9 @@ def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str
|
|
|
74
76
|
return parse_duration(v)
|
|
75
77
|
|
|
76
78
|
|
|
77
|
-
def parse_idle_duration(v: Optional[Union[int, str
|
|
78
|
-
if v
|
|
79
|
+
def parse_idle_duration(v: Optional[Union[int, str]]) -> Optional[Union[str, int]]:
|
|
80
|
+
if v == "off" or v == -1:
|
|
79
81
|
return -1
|
|
80
|
-
if v is True:
|
|
81
|
-
return None
|
|
82
82
|
return parse_duration(v)
|
|
83
83
|
|
|
84
84
|
|
|
@@ -249,7 +249,7 @@ class ProfileParams(CoreModel):
|
|
|
249
249
|
),
|
|
250
250
|
] = None
|
|
251
251
|
idle_duration: Annotated[
|
|
252
|
-
Optional[Union[Literal["off"], str, int
|
|
252
|
+
Optional[Union[Literal["off"], str, int]],
|
|
253
253
|
Field(
|
|
254
254
|
description=(
|
|
255
255
|
"Time to wait before terminating idle instances."
|
|
@@ -343,6 +343,9 @@ class ProfilesConfig(CoreModel):
|
|
|
343
343
|
profiles: List[Profile]
|
|
344
344
|
|
|
345
345
|
class Config:
|
|
346
|
+
json_loads = orjson.loads
|
|
347
|
+
json_dumps = pydantic_orjson_dumps_with_indent
|
|
348
|
+
|
|
346
349
|
schema_extra = {"$schema": "http://json-schema.org/draft-07/schema#"}
|
|
347
350
|
|
|
348
351
|
def default(self) -> Optional[Profile]:
|
|
@@ -382,14 +382,6 @@ class ResourcesSpec(CoreModel):
|
|
|
382
382
|
gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None
|
|
383
383
|
disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK
|
|
384
384
|
|
|
385
|
-
# TODO: Remove in 0.20. Added for backward compatibility.
|
|
386
|
-
@root_validator
|
|
387
|
-
def _post_validate(cls, values):
|
|
388
|
-
cpu = values.get("cpu")
|
|
389
|
-
if isinstance(cpu, CPUSpec) and cpu.arch in [None, gpuhunt.CPUArchitecture.X86]:
|
|
390
|
-
values["cpu"] = cpu.count
|
|
391
|
-
return values
|
|
392
|
-
|
|
393
385
|
def pretty_format(self) -> str:
|
|
394
386
|
# TODO: Remove in 0.20. Use self.cpu directly
|
|
395
387
|
cpu = parse_obj_as(CPUSpec, self.cpu)
|
|
@@ -407,3 +399,18 @@ class ResourcesSpec(CoreModel):
|
|
|
407
399
|
resources.update(disk_size=self.disk.size)
|
|
408
400
|
res = pretty_resources(**resources)
|
|
409
401
|
return res
|
|
402
|
+
|
|
403
|
+
def dict(self, *args, **kwargs) -> Dict:
|
|
404
|
+
# super() does not work with pydantic-duality
|
|
405
|
+
res = CoreModel.dict(self, *args, **kwargs)
|
|
406
|
+
self._update_serialized_cpu(res)
|
|
407
|
+
return res
|
|
408
|
+
|
|
409
|
+
# TODO: Remove in 0.20. Added for backward compatibility.
|
|
410
|
+
def _update_serialized_cpu(self, values: Dict):
|
|
411
|
+
cpu = values["cpu"]
|
|
412
|
+
if cpu:
|
|
413
|
+
arch = cpu.get("arch")
|
|
414
|
+
count = cpu.get("count")
|
|
415
|
+
if count and arch in [None, gpuhunt.CPUArchitecture.X86.value]:
|
|
416
|
+
values["cpu"] = count
|
|
@@ -11,6 +11,7 @@ from dstack._internal.core.models.configurations import (
|
|
|
11
11
|
DEFAULT_REPO_DIR,
|
|
12
12
|
AnyRunConfiguration,
|
|
13
13
|
RunConfiguration,
|
|
14
|
+
ServiceConfiguration,
|
|
14
15
|
)
|
|
15
16
|
from dstack._internal.core.models.files import FileArchiveMapping
|
|
16
17
|
from dstack._internal.core.models.instances import (
|
|
@@ -101,6 +102,14 @@ class RunTerminationReason(str, Enum):
|
|
|
101
102
|
}
|
|
102
103
|
return mapping[self]
|
|
103
104
|
|
|
105
|
+
def to_error(self) -> Optional[str]:
|
|
106
|
+
if self == RunTerminationReason.RETRY_LIMIT_EXCEEDED:
|
|
107
|
+
return "retry limit exceeded"
|
|
108
|
+
elif self == RunTerminationReason.SERVER_ERROR:
|
|
109
|
+
return "server error"
|
|
110
|
+
else:
|
|
111
|
+
return None
|
|
112
|
+
|
|
104
113
|
|
|
105
114
|
class JobTerminationReason(str, Enum):
|
|
106
115
|
# Set by the server
|
|
@@ -162,6 +171,24 @@ class JobTerminationReason(str, Enum):
|
|
|
162
171
|
default = RetryEvent.ERROR if self.to_status() == JobStatus.FAILED else None
|
|
163
172
|
return mapping.get(self, default)
|
|
164
173
|
|
|
174
|
+
def to_error(self) -> Optional[str]:
|
|
175
|
+
# Should return None for values that are already
|
|
176
|
+
# handled and shown in status_message.
|
|
177
|
+
error_mapping = {
|
|
178
|
+
JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
|
|
179
|
+
JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
|
|
180
|
+
JobTerminationReason.VOLUME_ERROR: "volume error",
|
|
181
|
+
JobTerminationReason.GATEWAY_ERROR: "gateway error",
|
|
182
|
+
JobTerminationReason.SCALED_DOWN: "scaled down",
|
|
183
|
+
JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
|
|
184
|
+
JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
|
|
185
|
+
JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed",
|
|
186
|
+
JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error",
|
|
187
|
+
JobTerminationReason.EXECUTOR_ERROR: "executor error",
|
|
188
|
+
JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded",
|
|
189
|
+
}
|
|
190
|
+
return error_mapping.get(self)
|
|
191
|
+
|
|
165
192
|
|
|
166
193
|
class Requirements(CoreModel):
|
|
167
194
|
# TODO: Make requirements' fields required
|
|
@@ -227,6 +254,8 @@ class JobSpec(CoreModel):
|
|
|
227
254
|
# TODO: drop this comment when supporting jobs submitted before 0.19.17 is no longer relevant.
|
|
228
255
|
repo_code_hash: Optional[str] = None
|
|
229
256
|
file_archives: list[FileArchiveMapping] = []
|
|
257
|
+
# None for non-services and pre-0.19.19 services. See `get_service_port`
|
|
258
|
+
service_port: Optional[int] = None
|
|
230
259
|
|
|
231
260
|
|
|
232
261
|
class JobProvisioningData(CoreModel):
|
|
@@ -305,13 +334,12 @@ class JobSubmission(CoreModel):
|
|
|
305
334
|
finished_at: Optional[datetime]
|
|
306
335
|
inactivity_secs: Optional[int]
|
|
307
336
|
status: JobStatus
|
|
337
|
+
status_message: str = "" # default for backward compatibility
|
|
308
338
|
termination_reason: Optional[JobTerminationReason]
|
|
309
339
|
termination_reason_message: Optional[str]
|
|
310
340
|
exit_status: Optional[int]
|
|
311
341
|
job_provisioning_data: Optional[JobProvisioningData]
|
|
312
342
|
job_runtime_data: Optional[JobRuntimeData]
|
|
313
|
-
# TODO: make status_message and error a computed field after migrating to pydanticV2
|
|
314
|
-
status_message: Optional[str] = None
|
|
315
343
|
error: Optional[str] = None
|
|
316
344
|
|
|
317
345
|
@property
|
|
@@ -325,71 +353,6 @@ class JobSubmission(CoreModel):
|
|
|
325
353
|
end_time = self.finished_at
|
|
326
354
|
return end_time - self.submitted_at
|
|
327
355
|
|
|
328
|
-
@root_validator
|
|
329
|
-
def _status_message(cls, values) -> Dict:
|
|
330
|
-
try:
|
|
331
|
-
status = values["status"]
|
|
332
|
-
termination_reason = values["termination_reason"]
|
|
333
|
-
exit_code = values["exit_status"]
|
|
334
|
-
except KeyError:
|
|
335
|
-
return values
|
|
336
|
-
values["status_message"] = JobSubmission._get_status_message(
|
|
337
|
-
status=status,
|
|
338
|
-
termination_reason=termination_reason,
|
|
339
|
-
exit_status=exit_code,
|
|
340
|
-
)
|
|
341
|
-
return values
|
|
342
|
-
|
|
343
|
-
@staticmethod
|
|
344
|
-
def _get_status_message(
|
|
345
|
-
status: JobStatus,
|
|
346
|
-
termination_reason: Optional[JobTerminationReason],
|
|
347
|
-
exit_status: Optional[int],
|
|
348
|
-
) -> str:
|
|
349
|
-
if status == JobStatus.DONE:
|
|
350
|
-
return "exited (0)"
|
|
351
|
-
elif status == JobStatus.FAILED:
|
|
352
|
-
if termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
|
|
353
|
-
return f"exited ({exit_status})"
|
|
354
|
-
elif termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
|
|
355
|
-
return "no offers"
|
|
356
|
-
elif termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
|
|
357
|
-
return "interrupted"
|
|
358
|
-
else:
|
|
359
|
-
return "error"
|
|
360
|
-
elif status == JobStatus.TERMINATED:
|
|
361
|
-
if termination_reason == JobTerminationReason.TERMINATED_BY_USER:
|
|
362
|
-
return "stopped"
|
|
363
|
-
elif termination_reason == JobTerminationReason.ABORTED_BY_USER:
|
|
364
|
-
return "aborted"
|
|
365
|
-
return status.value
|
|
366
|
-
|
|
367
|
-
@root_validator
|
|
368
|
-
def _error(cls, values) -> Dict:
|
|
369
|
-
try:
|
|
370
|
-
termination_reason = values["termination_reason"]
|
|
371
|
-
except KeyError:
|
|
372
|
-
return values
|
|
373
|
-
values["error"] = JobSubmission._get_error(termination_reason=termination_reason)
|
|
374
|
-
return values
|
|
375
|
-
|
|
376
|
-
@staticmethod
|
|
377
|
-
def _get_error(termination_reason: Optional[JobTerminationReason]) -> Optional[str]:
|
|
378
|
-
error_mapping = {
|
|
379
|
-
JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
|
|
380
|
-
JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
|
|
381
|
-
JobTerminationReason.VOLUME_ERROR: "volume error",
|
|
382
|
-
JobTerminationReason.GATEWAY_ERROR: "gateway error",
|
|
383
|
-
JobTerminationReason.SCALED_DOWN: "scaled down",
|
|
384
|
-
JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
|
|
385
|
-
JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
|
|
386
|
-
JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed",
|
|
387
|
-
JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error",
|
|
388
|
-
JobTerminationReason.EXECUTOR_ERROR: "executor error",
|
|
389
|
-
JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded",
|
|
390
|
-
}
|
|
391
|
-
return error_mapping.get(termination_reason)
|
|
392
|
-
|
|
393
356
|
|
|
394
357
|
class Job(CoreModel):
|
|
395
358
|
job_spec: JobSpec
|
|
@@ -524,85 +487,17 @@ class Run(CoreModel):
|
|
|
524
487
|
submitted_at: datetime
|
|
525
488
|
last_processed_at: datetime
|
|
526
489
|
status: RunStatus
|
|
527
|
-
status_message:
|
|
528
|
-
termination_reason: Optional[RunTerminationReason]
|
|
490
|
+
status_message: str = "" # default for backward compatibility
|
|
491
|
+
termination_reason: Optional[RunTerminationReason] = None
|
|
529
492
|
run_spec: RunSpec
|
|
530
493
|
jobs: List[Job]
|
|
531
|
-
latest_job_submission: Optional[JobSubmission]
|
|
494
|
+
latest_job_submission: Optional[JobSubmission] = None
|
|
532
495
|
cost: float = 0
|
|
533
496
|
service: Optional[ServiceSpec] = None
|
|
534
497
|
deployment_num: int = 0 # default for compatibility with pre-0.19.14 servers
|
|
535
|
-
# TODO: make error a computed field after migrating to pydanticV2
|
|
536
498
|
error: Optional[str] = None
|
|
537
499
|
deleted: Optional[bool] = None
|
|
538
500
|
|
|
539
|
-
@root_validator
|
|
540
|
-
def _error(cls, values) -> Dict:
|
|
541
|
-
try:
|
|
542
|
-
termination_reason = values["termination_reason"]
|
|
543
|
-
except KeyError:
|
|
544
|
-
return values
|
|
545
|
-
values["error"] = Run._get_error(termination_reason=termination_reason)
|
|
546
|
-
return values
|
|
547
|
-
|
|
548
|
-
@staticmethod
|
|
549
|
-
def _get_error(termination_reason: Optional[RunTerminationReason]) -> Optional[str]:
|
|
550
|
-
if termination_reason == RunTerminationReason.RETRY_LIMIT_EXCEEDED:
|
|
551
|
-
return "retry limit exceeded"
|
|
552
|
-
elif termination_reason == RunTerminationReason.SERVER_ERROR:
|
|
553
|
-
return "server error"
|
|
554
|
-
else:
|
|
555
|
-
return None
|
|
556
|
-
|
|
557
|
-
@root_validator
|
|
558
|
-
def _status_message(cls, values) -> Dict:
|
|
559
|
-
try:
|
|
560
|
-
status = values["status"]
|
|
561
|
-
jobs: List[Job] = values["jobs"]
|
|
562
|
-
retry_on_events = (
|
|
563
|
-
jobs[0].job_spec.retry.on_events if jobs and jobs[0].job_spec.retry else []
|
|
564
|
-
)
|
|
565
|
-
job_status = (
|
|
566
|
-
jobs[0].job_submissions[-1].status
|
|
567
|
-
if len(jobs) == 1 and jobs[0].job_submissions
|
|
568
|
-
else None
|
|
569
|
-
)
|
|
570
|
-
termination_reason = Run.get_last_termination_reason(jobs[0]) if jobs else None
|
|
571
|
-
except KeyError:
|
|
572
|
-
return values
|
|
573
|
-
values["status_message"] = Run._get_status_message(
|
|
574
|
-
status=status,
|
|
575
|
-
job_status=job_status,
|
|
576
|
-
retry_on_events=retry_on_events,
|
|
577
|
-
termination_reason=termination_reason,
|
|
578
|
-
)
|
|
579
|
-
return values
|
|
580
|
-
|
|
581
|
-
@staticmethod
|
|
582
|
-
def get_last_termination_reason(job: "Job") -> Optional[JobTerminationReason]:
|
|
583
|
-
for submission in reversed(job.job_submissions):
|
|
584
|
-
if submission.termination_reason is not None:
|
|
585
|
-
return submission.termination_reason
|
|
586
|
-
return None
|
|
587
|
-
|
|
588
|
-
@staticmethod
|
|
589
|
-
def _get_status_message(
|
|
590
|
-
status: RunStatus,
|
|
591
|
-
job_status: Optional[JobStatus],
|
|
592
|
-
retry_on_events: List[RetryEvent],
|
|
593
|
-
termination_reason: Optional[JobTerminationReason],
|
|
594
|
-
) -> str:
|
|
595
|
-
if job_status == JobStatus.PULLING:
|
|
596
|
-
return "pulling"
|
|
597
|
-
# Currently, `retrying` is shown only for `no-capacity` events
|
|
598
|
-
if (
|
|
599
|
-
status in [RunStatus.SUBMITTED, RunStatus.PENDING]
|
|
600
|
-
and termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY
|
|
601
|
-
and RetryEvent.NO_CAPACITY in retry_on_events
|
|
602
|
-
):
|
|
603
|
-
return "retrying"
|
|
604
|
-
return status.value
|
|
605
|
-
|
|
606
501
|
def is_deployment_in_progress(self) -> bool:
|
|
607
502
|
return any(
|
|
608
503
|
not j.job_submissions[-1].status.is_finished()
|
|
@@ -658,3 +553,11 @@ def get_policy_map(spot_policy: Optional[SpotPolicy], default: SpotPolicy) -> Op
|
|
|
658
553
|
SpotPolicy.ONDEMAND: False,
|
|
659
554
|
}
|
|
660
555
|
return policy_map[spot_policy]
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
def get_service_port(job_spec: JobSpec, configuration: ServiceConfiguration) -> int:
|
|
559
|
+
# Compatibility with pre-0.19.19 job specs that do not have the `service_port` property.
|
|
560
|
+
# TODO: drop when pre-0.19.19 jobs are no longer relevant.
|
|
561
|
+
if job_spec.service_port is None:
|
|
562
|
+
return configuration.port.container_port
|
|
563
|
+
return job_spec.service_port
|
|
@@ -9,6 +9,7 @@ from typing_extensions import Annotated, Self
|
|
|
9
9
|
|
|
10
10
|
from dstack._internal.core.models.backends.base import BackendType
|
|
11
11
|
from dstack._internal.core.models.common import CoreModel
|
|
12
|
+
from dstack._internal.core.models.profiles import parse_idle_duration
|
|
12
13
|
from dstack._internal.core.models.resources import Memory
|
|
13
14
|
from dstack._internal.utils.common import get_or_error
|
|
14
15
|
from dstack._internal.utils.tags import tags_validator
|
|
@@ -44,6 +45,16 @@ class VolumeConfiguration(CoreModel):
|
|
|
44
45
|
Optional[str],
|
|
45
46
|
Field(description="The volume ID. Must be specified when registering external volumes"),
|
|
46
47
|
] = None
|
|
48
|
+
auto_cleanup_duration: Annotated[
|
|
49
|
+
Optional[Union[str, int]],
|
|
50
|
+
Field(
|
|
51
|
+
description=(
|
|
52
|
+
"Time to wait after volume is no longer used by any job before deleting it. "
|
|
53
|
+
"Defaults to keep the volume indefinitely. "
|
|
54
|
+
"Use the value 'off' or -1 to disable auto-cleanup."
|
|
55
|
+
)
|
|
56
|
+
),
|
|
57
|
+
] = None
|
|
47
58
|
tags: Annotated[
|
|
48
59
|
Optional[Dict[str, str]],
|
|
49
60
|
Field(
|
|
@@ -56,6 +67,9 @@ class VolumeConfiguration(CoreModel):
|
|
|
56
67
|
] = None
|
|
57
68
|
|
|
58
69
|
_validate_tags = validator("tags", pre=True, allow_reuse=True)(tags_validator)
|
|
70
|
+
_validate_auto_cleanup_duration = validator(
|
|
71
|
+
"auto_cleanup_duration", pre=True, allow_reuse=True
|
|
72
|
+
)(parse_idle_duration)
|
|
59
73
|
|
|
60
74
|
@property
|
|
61
75
|
def size_gb(self) -> int:
|