dstack 0.19.12rc1__py3-none-any.whl → 0.19.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/attach.py +4 -4
- dstack/_internal/cli/services/configurators/run.py +44 -47
- dstack/_internal/cli/utils/run.py +31 -31
- dstack/_internal/core/backends/aws/compute.py +22 -9
- dstack/_internal/core/backends/aws/resources.py +26 -0
- dstack/_internal/core/backends/base/offers.py +0 -1
- dstack/_internal/core/backends/template/configurator.py.jinja +1 -6
- dstack/_internal/core/backends/template/models.py.jinja +4 -0
- dstack/_internal/core/compatibility/__init__.py +0 -0
- dstack/_internal/core/compatibility/fleets.py +72 -0
- dstack/_internal/core/compatibility/gateways.py +34 -0
- dstack/_internal/core/compatibility/runs.py +131 -0
- dstack/_internal/core/compatibility/volumes.py +32 -0
- dstack/_internal/core/models/configurations.py +1 -1
- dstack/_internal/core/models/fleets.py +6 -1
- dstack/_internal/core/models/instances.py +51 -12
- dstack/_internal/core/models/profiles.py +43 -3
- dstack/_internal/core/models/projects.py +1 -0
- dstack/_internal/core/models/repos/local.py +3 -3
- dstack/_internal/core/models/runs.py +139 -43
- dstack/_internal/server/app.py +46 -1
- dstack/_internal/server/background/tasks/process_running_jobs.py +92 -15
- dstack/_internal/server/background/tasks/process_runs.py +163 -80
- dstack/_internal/server/migrations/versions/35e90e1b0d3e_add_rolling_deployment_fields.py +42 -0
- dstack/_internal/server/migrations/versions/35f732ee4cf5_add_projectmodel_is_public.py +39 -0
- dstack/_internal/server/models.py +4 -0
- dstack/_internal/server/routers/projects.py +4 -3
- dstack/_internal/server/routers/prometheus.py +4 -1
- dstack/_internal/server/schemas/projects.py +1 -0
- dstack/_internal/server/security/permissions.py +36 -0
- dstack/_internal/server/services/jobs/__init__.py +1 -0
- dstack/_internal/server/services/jobs/configurators/base.py +11 -7
- dstack/_internal/server/services/projects.py +54 -1
- dstack/_internal/server/services/runner/client.py +4 -1
- dstack/_internal/server/services/runs.py +49 -29
- dstack/_internal/server/services/services/__init__.py +19 -0
- dstack/_internal/server/services/services/autoscalers.py +37 -26
- dstack/_internal/server/services/storage/__init__.py +38 -0
- dstack/_internal/server/services/storage/base.py +27 -0
- dstack/_internal/server/services/storage/gcs.py +44 -0
- dstack/_internal/server/services/{storage.py → storage/s3.py} +4 -27
- dstack/_internal/server/settings.py +7 -3
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-5b9786c955b42bf93581.js → main-0ac1e1583684417ae4d1.js} +1695 -62
- dstack/_internal/server/statics/{main-5b9786c955b42bf93581.js.map → main-0ac1e1583684417ae4d1.js.map} +1 -1
- dstack/_internal/server/statics/{main-8f9c66f404e9c7e7e020.css → main-f39c418b05fe14772dd8.css} +1 -1
- dstack/_internal/server/testing/common.py +11 -1
- dstack/_internal/settings.py +3 -0
- dstack/_internal/utils/common.py +4 -0
- dstack/api/_public/runs.py +14 -5
- dstack/api/server/_fleets.py +9 -69
- dstack/api/server/_gateways.py +3 -14
- dstack/api/server/_projects.py +2 -2
- dstack/api/server/_runs.py +4 -116
- dstack/api/server/_volumes.py +3 -14
- dstack/plugins/builtin/rest_plugin/_plugin.py +24 -5
- dstack/version.py +2 -2
- {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/METADATA +1 -1
- {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/RECORD +62 -52
- {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/WHEEL +0 -0
- {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.12rc1.dist-info → dstack-0.19.14.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
from typing import Any, Dict, Optional
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.models.configurations import ServiceConfiguration
|
|
4
|
+
from dstack._internal.core.models.runs import ApplyRunPlanInput, JobSubmission, RunSpec
|
|
5
|
+
from dstack._internal.server.schemas.runs import GetRunPlanRequest
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[Dict]:
    """
    Returns `plan` exclude mapping to exclude certain fields from the request.
    Use this method to exclude new fields when they are not set to keep
    clients backward-compatibility with older servers.

    The result always has the form `{"plan": {...}}`. The inner mapping may hold
    excludes for `run_spec` and, when `plan.current_resource` is set, for
    `current_resource` (including its jobs' submissions and its latest submission).
    """
    apply_plan_excludes: Dict[str, Any] = {}
    run_spec_excludes = get_run_spec_excludes(plan.run_spec)
    if run_spec_excludes is not None:
        apply_plan_excludes["run_spec"] = run_spec_excludes
    current_resource = plan.current_resource
    if current_resource is not None:
        # `status_message` is always excluded from the current resource.
        current_resource_excludes: Dict[str, Any] = {"status_message": True}
        if current_resource.deployment_num == 0:
            current_resource_excludes["deployment_num"] = True
        # May be `None` when nothing has to be excluded from the run spec.
        current_resource_excludes["run_spec"] = get_run_spec_excludes(current_resource.run_spec)
        # One exclude mapping is applied to every submission of every job (`__all__`),
        # so a field is excluded only if it is unset in all of them.
        job_submissions = [js for j in current_resource.jobs for js in j.job_submissions]
        current_resource_excludes["jobs"] = {
            "__all__": {
                "job_submissions": {"__all__": _get_job_submissions_excludes(job_submissions)}
            }
        }
        latest_job_submission = current_resource.latest_job_submission
        if latest_job_submission is not None:
            current_resource_excludes["latest_job_submission"] = _get_job_submissions_excludes(
                [latest_job_submission]
            )
        apply_plan_excludes["current_resource"] = current_resource_excludes
    return {"plan": apply_plan_excludes}


def _get_job_submissions_excludes(job_submissions: list[JobSubmission]) -> Dict[str, Any]:
    """
    Returns one exclude mapping valid for every submission in `job_submissions`.
    A field is excluded only if it is unset in all the given submissions,
    so an empty list excludes every field handled here.
    """
    excludes: Dict[str, Any] = {}
    if all(map(_should_exclude_job_submission_jpd_cpu_arch, job_submissions)):
        excludes["job_provisioning_data"] = {"instance_type": {"resources": {"cpu_arch"}}}
    if all(map(_should_exclude_job_submission_jrd_cpu_arch, job_submissions)):
        excludes["job_runtime_data"] = {"offer": {"instance": {"resources": {"cpu_arch"}}}}
    if all(js.exit_status is None for js in job_submissions):
        excludes["exit_status"] = True
    if all(js.deployment_num == 0 for js in job_submissions):
        excludes["deployment_num"] = True
    return excludes
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_get_plan_excludes(request: GetRunPlanRequest) -> Optional[Dict]:
    """
    Builds the exclude mapping for a get-plan request.
    Fields that are new and not set are left out of the request to keep
    clients backward-compatible with older servers.
    """
    excludes = {}
    spec_excludes = get_run_spec_excludes(request.run_spec)
    if spec_excludes is not None:
        excludes["run_spec"] = spec_excludes
    # `max_offers` is only sent when the caller set it explicitly.
    if request.max_offers is None:
        excludes["max_offers"] = True
    return excludes
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def get_run_spec_excludes(run_spec: RunSpec) -> Optional[Dict]:
    """
    Builds the `run_spec` exclude mapping used to drop certain fields from the request.
    New fields are excluded while they are not set, which keeps
    clients backward-compatible with older servers.

    Returns `None` when there is nothing to exclude.
    """
    configuration = run_spec.configuration
    profile = run_spec.profile

    # Configuration fields that are excluded while they are `None`.
    configuration_excludes: dict[str, Any] = {
        name: True for name in ("fleets", "tags") if getattr(configuration, name) is None
    }
    # `rate_limits` exists on service configurations only; any falsy value is excluded.
    if isinstance(configuration, ServiceConfiguration) and not configuration.rate_limits:
        configuration_excludes["rate_limits"] = True
    for name in ("shell", "priority", "startup_order", "stop_criteria"):
        if getattr(configuration, name) is None:
            configuration_excludes[name] = True

    # Profile fields are only inspected when the run spec carries a profile.
    profile_excludes: set[str] = set()
    if profile is not None:
        for name in ("fleets", "tags", "startup_order", "stop_criteria"):
            if getattr(profile, name) is None:
                profile_excludes.add(name)

    spec_excludes: dict[str, Any] = {}
    if configuration_excludes:
        spec_excludes["configuration"] = configuration_excludes
    if profile_excludes:
        spec_excludes["profile"] = profile_excludes
    return spec_excludes or None
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _should_exclude_job_submission_jpd_cpu_arch(job_submission: JobSubmission) -> bool:
    """
    Tells whether `cpu_arch` under `job_provisioning_data.instance_type.resources`
    is unset, either explicitly `None` or unreachable because a link in the chain
    is missing.
    """
    try:
        resources = job_submission.job_provisioning_data.instance_type.resources
        cpu_arch = resources.cpu_arch
    except AttributeError:
        # Some object along the attribute chain is absent (e.g. `None`).
        return True
    return cpu_arch is None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _should_exclude_job_submission_jrd_cpu_arch(job_submission: JobSubmission) -> bool:
    """
    Tells whether `cpu_arch` under `job_runtime_data.offer.instance.resources`
    is unset, either explicitly `None` or unreachable because a link in the chain
    is missing.
    """
    try:
        resources = job_submission.job_runtime_data.offer.instance.resources
        cpu_arch = resources.cpu_arch
    except AttributeError:
        # Some object along the attribute chain is absent (e.g. `None`).
        return True
    return cpu_arch is None
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.models.volumes import VolumeConfiguration, VolumeSpec
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_volume_spec_excludes(volume_spec: VolumeSpec) -> Dict:
    """
    Builds the `volume_spec` exclude mapping used to drop certain fields from the request.
    New fields are excluded while they are not set, which keeps
    clients backward-compatible with older servers.
    """
    # The only nested excludes come from the volume configuration.
    return {"configuration": _get_volume_configuration_excludes(volume_spec.configuration)}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def get_create_volume_excludes(configuration: VolumeConfiguration) -> Dict:
    """
    Builds the exclude mapping for the create volume request.
    New fields are excluded while they are not set, which keeps
    clients backward-compatible with older servers.
    """
    # The request nests the volume configuration under the `configuration` key.
    return {"configuration": _get_volume_configuration_excludes(configuration)}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _get_volume_configuration_excludes(configuration: VolumeConfiguration) -> Dict:
    """
    Returns the exclude mapping for a volume configuration:
    `tags` is excluded while it is `None`, nothing else is excluded.
    """
    if configuration.tags is None:
        return {"tags": True}
    return {}
|
|
@@ -440,7 +440,7 @@ class ServiceConfigurationParams(CoreModel):
|
|
|
440
440
|
raise ValueError("The minimum number of replicas must be greater than or equal to 0")
|
|
441
441
|
if v.max < v.min:
|
|
442
442
|
raise ValueError(
|
|
443
|
-
"The maximum number of replicas must be greater than or equal to the
|
|
443
|
+
"The maximum number of replicas must be greater than or equal to the minimum number of replicas"
|
|
444
444
|
)
|
|
445
445
|
return v
|
|
446
446
|
|
|
@@ -20,6 +20,7 @@ from dstack._internal.core.models.profiles import (
|
|
|
20
20
|
parse_idle_duration,
|
|
21
21
|
)
|
|
22
22
|
from dstack._internal.core.models.resources import Range, ResourcesSpec
|
|
23
|
+
from dstack._internal.utils.common import list_enum_values_for_annotation
|
|
23
24
|
from dstack._internal.utils.json_schema import add_extra_schema_types
|
|
24
25
|
from dstack._internal.utils.tags import tags_validator
|
|
25
26
|
|
|
@@ -207,7 +208,11 @@ class InstanceGroupParams(CoreModel):
|
|
|
207
208
|
spot_policy: Annotated[
|
|
208
209
|
Optional[SpotPolicy],
|
|
209
210
|
Field(
|
|
210
|
-
description=
|
|
211
|
+
description=(
|
|
212
|
+
"The policy for provisioning spot or on-demand instances:"
|
|
213
|
+
f" {list_enum_values_for_annotation(SpotPolicy)}."
|
|
214
|
+
f" Defaults to `{SpotPolicy.ONDEMAND.value}`"
|
|
215
|
+
)
|
|
211
216
|
),
|
|
212
217
|
] = None
|
|
213
218
|
retry: Annotated[
|
|
@@ -48,29 +48,68 @@ class Resources(CoreModel):
|
|
|
48
48
|
gpus: List[Gpu]
|
|
49
49
|
spot: bool
|
|
50
50
|
disk: Disk = Disk(size_mib=102400) # the default value (100GB) for backward compatibility
|
|
51
|
+
# TODO: make description a computed field after migrating to pydanticV2
|
|
51
52
|
description: str = ""
|
|
52
53
|
cpu_arch: Optional[gpuhunt.CPUArchitecture] = None
|
|
53
54
|
|
|
54
|
-
|
|
55
|
+
@root_validator
def _description(cls, values) -> Dict:
    """
    Fills in `description` from the other resource fields when it is empty.
    `values` is returned unchanged if `description` is already set
    or if any of the fields needed to build it is missing from `values`.
    """
    try:
        if values["description"]:
            return values
        cpus = values["cpus"]
        memory_mib = values["memory_mib"]
        gpus = values["gpus"]
        disk_size_mib = values["disk"].size_mib
        spot = values["spot"]
        cpu_arch = values["cpu_arch"]
        values["description"] = Resources._pretty_format(
            cpus, cpu_arch, memory_mib, disk_size_mib, gpus, spot, include_spot=True
        )
    except KeyError:
        # A required key is absent (presumably a field that failed validation);
        # leave the description as it is.
        pass
    return values
|
|
72
|
+
|
|
73
|
+
@staticmethod
|
|
74
|
+
def _pretty_format(
|
|
75
|
+
cpus: int,
|
|
76
|
+
cpu_arch: Optional[gpuhunt.CPUArchitecture],
|
|
77
|
+
memory_mib: int,
|
|
78
|
+
disk_size_mib: int,
|
|
79
|
+
gpus: List[Gpu],
|
|
80
|
+
spot: bool,
|
|
81
|
+
include_spot: bool = False,
|
|
82
|
+
) -> str:
|
|
55
83
|
resources = {}
|
|
56
|
-
if
|
|
57
|
-
resources["cpus"] =
|
|
58
|
-
resources["cpu_arch"] =
|
|
59
|
-
if
|
|
60
|
-
resources["memory"] = f"{
|
|
61
|
-
if
|
|
62
|
-
resources["disk_size"] = f"{
|
|
63
|
-
if
|
|
64
|
-
gpu =
|
|
84
|
+
if cpus > 0:
|
|
85
|
+
resources["cpus"] = cpus
|
|
86
|
+
resources["cpu_arch"] = cpu_arch
|
|
87
|
+
if memory_mib > 0:
|
|
88
|
+
resources["memory"] = f"{memory_mib / 1024:.0f}GB"
|
|
89
|
+
if disk_size_mib > 0:
|
|
90
|
+
resources["disk_size"] = f"{disk_size_mib / 1024:.0f}GB"
|
|
91
|
+
if gpus:
|
|
92
|
+
gpu = gpus[0]
|
|
65
93
|
resources["gpu_name"] = gpu.name
|
|
66
|
-
resources["gpu_count"] = len(
|
|
94
|
+
resources["gpu_count"] = len(gpus)
|
|
67
95
|
if gpu.memory_mib > 0:
|
|
68
96
|
resources["gpu_memory"] = f"{gpu.memory_mib / 1024:.0f}GB"
|
|
69
97
|
output = pretty_resources(**resources)
|
|
70
|
-
if include_spot and
|
|
98
|
+
if include_spot and spot:
|
|
71
99
|
output += " (spot)"
|
|
72
100
|
return output
|
|
73
101
|
|
|
102
|
+
def pretty_format(self, include_spot: bool = False) -> str:
    """
    Formats this instance's resources as a string via `Resources._pretty_format`.
    `include_spot` is forwarded as is.
    """
    return Resources._pretty_format(
        cpus=self.cpus,
        cpu_arch=self.cpu_arch,
        memory_mib=self.memory_mib,
        disk_size_mib=self.disk.size_mib,
        gpus=self.gpus,
        spot=self.spot,
        include_spot=include_spot,
    )
|
|
112
|
+
|
|
74
113
|
|
|
75
114
|
class InstanceType(CoreModel):
|
|
76
115
|
name: str
|
|
@@ -6,6 +6,7 @@ from typing_extensions import Annotated, Literal
|
|
|
6
6
|
|
|
7
7
|
from dstack._internal.core.models.backends.base import BackendType
|
|
8
8
|
from dstack._internal.core.models.common import CoreModel, Duration
|
|
9
|
+
from dstack._internal.utils.common import list_enum_values_for_annotation
|
|
9
10
|
from dstack._internal.utils.tags import tags_validator
|
|
10
11
|
|
|
11
12
|
DEFAULT_RETRY_DURATION = 3600
|
|
@@ -32,6 +33,17 @@ class TerminationPolicy(str, Enum):
|
|
|
32
33
|
DESTROY_AFTER_IDLE = "destroy-after-idle"
|
|
33
34
|
|
|
34
35
|
|
|
36
|
+
class StartupOrder(str, Enum):
    """The order in which master and workers jobs are started."""

    ANY = "any"
    MASTER_FIRST = "master-first"
    WORKERS_FIRST = "workers-first"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class StopCriteria(str, Enum):
    """The criteria determining when a multi-node run should be considered finished."""

    ALL_DONE = "all-done"
    MASTER_DONE = "master-done"
|
|
45
|
+
|
|
46
|
+
|
|
35
47
|
@overload
|
|
36
48
|
def parse_duration(v: None) -> None: ...
|
|
37
49
|
|
|
@@ -102,7 +114,7 @@ class ProfileRetry(CoreModel):
|
|
|
102
114
|
Field(
|
|
103
115
|
description=(
|
|
104
116
|
"The list of events that should be handled with retry."
|
|
105
|
-
" Supported events are
|
|
117
|
+
f" Supported events are {list_enum_values_for_annotation(RetryEvent)}."
|
|
106
118
|
" Omit to retry on all events"
|
|
107
119
|
)
|
|
108
120
|
),
|
|
@@ -190,7 +202,11 @@ class ProfileParams(CoreModel):
|
|
|
190
202
|
spot_policy: Annotated[
|
|
191
203
|
Optional[SpotPolicy],
|
|
192
204
|
Field(
|
|
193
|
-
description=
|
|
205
|
+
description=(
|
|
206
|
+
"The policy for provisioning spot or on-demand instances:"
|
|
207
|
+
f" {list_enum_values_for_annotation(SpotPolicy)}."
|
|
208
|
+
f" Defaults to `{SpotPolicy.ONDEMAND.value}`"
|
|
209
|
+
)
|
|
194
210
|
),
|
|
195
211
|
] = None
|
|
196
212
|
retry: Annotated[
|
|
@@ -225,7 +241,11 @@ class ProfileParams(CoreModel):
|
|
|
225
241
|
creation_policy: Annotated[
|
|
226
242
|
Optional[CreationPolicy],
|
|
227
243
|
Field(
|
|
228
|
-
description=
|
|
244
|
+
description=(
|
|
245
|
+
"The policy for using instances from fleets:"
|
|
246
|
+
f" {list_enum_values_for_annotation(CreationPolicy)}."
|
|
247
|
+
f" Defaults to `{CreationPolicy.REUSE_OR_CREATE.value}`"
|
|
248
|
+
)
|
|
229
249
|
),
|
|
230
250
|
] = None
|
|
231
251
|
idle_duration: Annotated[
|
|
@@ -241,6 +261,26 @@ class ProfileParams(CoreModel):
|
|
|
241
261
|
Optional[UtilizationPolicy],
|
|
242
262
|
Field(description="Run termination policy based on utilization"),
|
|
243
263
|
] = None
|
|
264
|
+
startup_order: Annotated[
|
|
265
|
+
Optional[StartupOrder],
|
|
266
|
+
Field(
|
|
267
|
+
description=(
|
|
268
|
+
f"The order in which master and workers jobs are started:"
|
|
269
|
+
f" {list_enum_values_for_annotation(StartupOrder)}."
|
|
270
|
+
f" Defaults to `{StartupOrder.ANY.value}`"
|
|
271
|
+
)
|
|
272
|
+
),
|
|
273
|
+
] = None
|
|
274
|
+
stop_criteria: Annotated[
|
|
275
|
+
Optional[StopCriteria],
|
|
276
|
+
Field(
|
|
277
|
+
description=(
|
|
278
|
+
"The criteria determining when a multi-node run should be considered finished:"
|
|
279
|
+
f" {list_enum_values_for_annotation(StopCriteria)}."
|
|
280
|
+
f" Defaults to `{StopCriteria.ALL_DONE.value}`"
|
|
281
|
+
)
|
|
282
|
+
),
|
|
283
|
+
] = None
|
|
244
284
|
fleets: Annotated[
|
|
245
285
|
Optional[list[str]], Field(description="The fleets considered for reuse")
|
|
246
286
|
] = None
|
|
@@ -84,9 +84,9 @@ class LocalRepo(Repo):
|
|
|
84
84
|
.add_custom_ignore_filename(".dstackignore")
|
|
85
85
|
.build()
|
|
86
86
|
):
|
|
87
|
-
|
|
88
|
-
if
|
|
89
|
-
t.add(path, recursive=False)
|
|
87
|
+
entry_path_within_repo = entry.path().relative_to(repo_path)
|
|
88
|
+
if entry_path_within_repo != Path("."):
|
|
89
|
+
t.add(entry.path(), arcname=entry_path_within_repo, recursive=False)
|
|
90
90
|
logger.debug("Code file size: %s", sizeof_fmt(fp.tell()))
|
|
91
91
|
return get_sha256(fp)
|
|
92
92
|
|
|
@@ -148,8 +148,18 @@ class JobTerminationReason(str, Enum):
|
|
|
148
148
|
}
|
|
149
149
|
return mapping[self]
|
|
150
150
|
|
|
151
|
-
def
|
|
152
|
-
|
|
151
|
+
def to_retry_event(self) -> Optional[RetryEvent]:
    """
    Returns:
        the retry event this termination reason triggers
        or None if this termination reason should not be retried
    """
    # Reasons without a dedicated event fall back to `ERROR` when they map
    # to the failed job status, otherwise they are not retried.
    fallback = RetryEvent.ERROR if self.to_status() == JobStatus.FAILED else None
    if self is JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
        return RetryEvent.NO_CAPACITY
    if self is JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
        return RetryEvent.INTERRUPTION
    return fallback
|
|
153
163
|
|
|
154
164
|
|
|
155
165
|
class Requirements(CoreModel):
|
|
@@ -279,6 +289,7 @@ class ClusterInfo(CoreModel):
|
|
|
279
289
|
class JobSubmission(CoreModel):
|
|
280
290
|
id: UUID4
|
|
281
291
|
submission_num: int
|
|
292
|
+
deployment_num: int = 0 # default for compatibility with pre-0.19.14 servers
|
|
282
293
|
submitted_at: datetime
|
|
283
294
|
last_processed_at: datetime
|
|
284
295
|
finished_at: Optional[datetime]
|
|
@@ -289,6 +300,9 @@ class JobSubmission(CoreModel):
|
|
|
289
300
|
exit_status: Optional[int]
|
|
290
301
|
job_provisioning_data: Optional[JobProvisioningData]
|
|
291
302
|
job_runtime_data: Optional[JobRuntimeData]
|
|
303
|
+
# TODO: make status_message and error a computed field after migrating to pydanticV2
|
|
304
|
+
status_message: Optional[str]
|
|
305
|
+
error: Optional[str] = None
|
|
292
306
|
|
|
293
307
|
@property
|
|
294
308
|
def age(self) -> timedelta:
|
|
@@ -301,6 +315,71 @@ class JobSubmission(CoreModel):
|
|
|
301
315
|
end_time = self.finished_at
|
|
302
316
|
return end_time - self.submitted_at
|
|
303
317
|
|
|
318
|
+
@root_validator
def _status_message(cls, values) -> Dict:
    """
    Sets `status_message` from `status`, `termination_reason` and `exit_status`.
    `values` is returned unchanged if any of these three keys is missing.
    """
    required_keys = ("status", "termination_reason", "exit_status")
    if any(key not in values for key in required_keys):
        return values
    values["status_message"] = JobSubmission._get_status_message(
        status=values["status"],
        termination_reason=values["termination_reason"],
        exit_status=values["exit_status"],
    )
    return values
|
|
332
|
+
|
|
333
|
+
@staticmethod
def _get_status_message(
    status: JobStatus,
    termination_reason: Optional[JobTerminationReason],
    exit_status: Optional[int],
) -> str:
    """
    Returns a short status text for a job submission.
    Done and failed jobs, and jobs stopped or aborted by the user, get a dedicated
    message; every other case yields the raw status value.
    """
    if status == JobStatus.DONE:
        return "exited (0)"
    if status == JobStatus.FAILED:
        if termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
            return f"exited ({exit_status})"
        if termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
            return "no offers"
        if termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
            return "interrupted"
        return "error"
    if status == JobStatus.TERMINATED:
        if termination_reason == JobTerminationReason.TERMINATED_BY_USER:
            return "stopped"
        if termination_reason == JobTerminationReason.ABORTED_BY_USER:
            return "aborted"
        # Other termination reasons fall through to the raw status value.
    return status.value
|
|
356
|
+
|
|
357
|
+
@root_validator
def _error(cls, values) -> Dict:
    """
    Sets `error` from `termination_reason`.
    `values` is returned unchanged if `termination_reason` is missing.
    """
    if "termination_reason" in values:
        values["error"] = JobSubmission._get_error(
            termination_reason=values["termination_reason"]
        )
    return values
|
|
365
|
+
|
|
366
|
+
@staticmethod
def _get_error(termination_reason: Optional[JobTerminationReason]) -> Optional[str]:
    """
    Returns the error text for `termination_reason`,
    or None if the reason (including None) has no error text.
    """
    reason_to_error = {
        JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
        JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
        JobTerminationReason.VOLUME_ERROR: "volume error",
        JobTerminationReason.GATEWAY_ERROR: "gateway error",
        JobTerminationReason.SCALED_DOWN: "scaled down",
        JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
        JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
        JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed",
        JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error",
        JobTerminationReason.EXECUTOR_ERROR: "executor error",
        JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded",
    }
    error = reason_to_error.get(termination_reason)
    return error
|
|
382
|
+
|
|
304
383
|
|
|
305
384
|
class Job(CoreModel):
|
|
306
385
|
job_spec: JobSpec
|
|
@@ -431,12 +510,14 @@ class Run(CoreModel):
|
|
|
431
510
|
submitted_at: datetime
|
|
432
511
|
last_processed_at: datetime
|
|
433
512
|
status: RunStatus
|
|
513
|
+
status_message: Optional[str] = None
|
|
434
514
|
termination_reason: Optional[RunTerminationReason]
|
|
435
515
|
run_spec: RunSpec
|
|
436
516
|
jobs: List[Job]
|
|
437
517
|
latest_job_submission: Optional[JobSubmission]
|
|
438
518
|
cost: float = 0
|
|
439
519
|
service: Optional[ServiceSpec] = None
|
|
520
|
+
deployment_num: int = 0 # default for compatibility with pre-0.19.14 servers
|
|
440
521
|
# TODO: make error a computed field after migrating to pydanticV2
|
|
441
522
|
error: Optional[str] = None
|
|
442
523
|
deleted: Optional[bool] = None
|
|
@@ -445,15 +526,67 @@ class Run(CoreModel):
|
|
|
445
526
|
def _error(cls, values) -> Dict:
|
|
446
527
|
try:
|
|
447
528
|
termination_reason = values["termination_reason"]
|
|
448
|
-
jobs = values["jobs"]
|
|
449
529
|
except KeyError:
|
|
450
530
|
return values
|
|
451
|
-
values["error"] =
|
|
452
|
-
|
|
453
|
-
|
|
531
|
+
values["error"] = Run._get_error(termination_reason=termination_reason)
|
|
532
|
+
return values
|
|
533
|
+
|
|
534
|
+
@staticmethod
def _get_error(termination_reason: Optional[RunTerminationReason]) -> Optional[str]:
    """
    Returns the error text for a run's `termination_reason`.
    Only the retry limit and server error reasons have one; otherwise None is returned.
    """
    if termination_reason == RunTerminationReason.RETRY_LIMIT_EXCEEDED:
        return "retry limit exceeded"
    if termination_reason == RunTerminationReason.SERVER_ERROR:
        return "server error"
    return None
|
|
542
|
+
|
|
543
|
+
@root_validator
def _status_message(cls, values) -> Dict:
    """
    Sets `status_message` from the run `status` and the first job's
    retry events and last termination reason.
    `values` is returned unchanged if a `KeyError` is raised while collecting them.
    """
    try:
        status = values["status"]
        jobs: List[Job] = values["jobs"]
        # Without jobs there are no retry events and no termination reason.
        retry_on_events = []
        termination_reason = None
        if jobs:
            first_job = jobs[0]
            if first_job.job_spec.retry:
                retry_on_events = first_job.job_spec.retry.on_events
            termination_reason = Run.get_last_termination_reason(first_job)
    except KeyError:
        return values
    values["status_message"] = Run._get_status_message(
        status=status,
        retry_on_events=retry_on_events,
        termination_reason=termination_reason,
    )
    return values
|
|
456
560
|
|
|
561
|
+
@staticmethod
def get_last_termination_reason(job: "Job") -> Optional[JobTerminationReason]:
    """
    Returns the termination reason of the most recent submission of `job`
    that has one, or None if no submission has a termination reason.
    """
    reasons_newest_first = (
        submission.termination_reason for submission in reversed(job.job_submissions)
    )
    return next((reason for reason in reasons_newest_first if reason is not None), None)
|
|
567
|
+
|
|
568
|
+
@staticmethod
def _get_status_message(
    status: RunStatus,
    retry_on_events: List[RetryEvent],
    termination_reason: Optional[JobTerminationReason],
) -> str:
    """
    Returns "retrying" for a submitted or pending run whose last termination reason
    is a no-capacity start failure and whose retry events include `no-capacity`.
    Every other case yields the raw status value.
    """
    # Currently, `retrying` is shown only for `no-capacity` events
    if status not in (RunStatus.SUBMITTED, RunStatus.PENDING):
        return status.value
    if termination_reason != JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
        return status.value
    if RetryEvent.NO_CAPACITY not in retry_on_events:
        return status.value
    return "retrying"
|
|
582
|
+
|
|
583
|
+
def is_deployment_in_progress(self) -> bool:
    """
    Tells whether any job's latest submission is still unfinished while belonging
    to a deployment other than the run's current `deployment_num`.

    Jobs without submissions are skipped instead of raising `IndexError`.
    """
    for job in self.jobs:
        if not job.job_submissions:
            # Nothing was submitted for this job, so it cannot hold the deployment back.
            continue
        latest_submission = job.job_submissions[-1]
        if (
            not latest_submission.status.is_finished()
            and latest_submission.deployment_num != self.deployment_num
        ):
            return True
    return False
|
|
589
|
+
|
|
457
590
|
|
|
458
591
|
class JobPlan(CoreModel):
|
|
459
592
|
job_spec: JobSpec
|
|
@@ -502,40 +635,3 @@ def get_policy_map(spot_policy: Optional[SpotPolicy], default: SpotPolicy) -> Op
|
|
|
502
635
|
SpotPolicy.ONDEMAND: False,
|
|
503
636
|
}
|
|
504
637
|
return policy_map[spot_policy]
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
def _get_run_error(
|
|
508
|
-
run_termination_reason: Optional[RunTerminationReason],
|
|
509
|
-
run_jobs: List[Job],
|
|
510
|
-
) -> str:
|
|
511
|
-
if run_termination_reason is None:
|
|
512
|
-
return ""
|
|
513
|
-
if len(run_jobs) > 1:
|
|
514
|
-
return run_termination_reason.name
|
|
515
|
-
run_job_termination_reason, exit_status = _get_run_job_termination_reason_and_exit_status(
|
|
516
|
-
run_jobs
|
|
517
|
-
)
|
|
518
|
-
# For failed runs, also show termination reason to provide more context.
|
|
519
|
-
# For other run statuses, the job termination reason will duplicate run status.
|
|
520
|
-
if run_job_termination_reason is not None and run_termination_reason in [
|
|
521
|
-
RunTerminationReason.JOB_FAILED,
|
|
522
|
-
RunTerminationReason.SERVER_ERROR,
|
|
523
|
-
RunTerminationReason.RETRY_LIMIT_EXCEEDED,
|
|
524
|
-
]:
|
|
525
|
-
if exit_status:
|
|
526
|
-
return (
|
|
527
|
-
f"{run_termination_reason.name}\n({run_job_termination_reason.name} {exit_status})"
|
|
528
|
-
)
|
|
529
|
-
return f"{run_termination_reason.name}\n({run_job_termination_reason.name})"
|
|
530
|
-
return run_termination_reason.name
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
def _get_run_job_termination_reason_and_exit_status(
|
|
534
|
-
run_jobs: List[Job],
|
|
535
|
-
) -> tuple[Optional[JobTerminationReason], Optional[int]]:
|
|
536
|
-
for job in run_jobs:
|
|
537
|
-
if len(job.job_submissions) > 0:
|
|
538
|
-
job_submission = job.job_submissions[-1]
|
|
539
|
-
if job_submission.termination_reason is not None:
|
|
540
|
-
return job_submission.termination_reason, job_submission.exit_status
|
|
541
|
-
return None, None
|