dstack 0.19.11rc2__py3-none-any.whl → 0.19.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- dstack/_internal/cli/commands/offer.py +2 -0
- dstack/_internal/cli/services/configurators/run.py +43 -42
- dstack/_internal/cli/utils/run.py +10 -26
- dstack/_internal/core/backends/template/configurator.py.jinja +1 -6
- dstack/_internal/core/backends/template/models.py.jinja +4 -0
- dstack/_internal/core/models/configurations.py +1 -1
- dstack/_internal/core/models/fleets.py +6 -1
- dstack/_internal/core/models/profiles.py +43 -3
- dstack/_internal/core/models/repos/local.py +19 -13
- dstack/_internal/core/models/runs.py +78 -45
- dstack/_internal/server/background/tasks/process_running_jobs.py +47 -12
- dstack/_internal/server/background/tasks/process_runs.py +14 -1
- dstack/_internal/server/services/fleets.py +2 -2
- dstack/_internal/server/services/gateways/__init__.py +1 -1
- dstack/_internal/server/services/plugins.py +3 -2
- dstack/_internal/server/services/runner/client.py +4 -1
- dstack/_internal/server/services/runs.py +2 -2
- dstack/_internal/server/services/volumes.py +1 -1
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-5b9786c955b42bf93581.js → main-b0e80f8e26a168c129e9.js} +72 -25
- dstack/_internal/server/statics/{main-5b9786c955b42bf93581.js.map → main-b0e80f8e26a168c129e9.js.map} +1 -1
- dstack/_internal/server/testing/common.py +2 -1
- dstack/_internal/utils/common.py +4 -0
- dstack/api/server/_fleets.py +5 -1
- dstack/api/server/_runs.py +8 -0
- dstack/version.py +1 -1
- {dstack-0.19.11rc2.dist-info → dstack-0.19.12.dist-info}/METADATA +2 -1
- {dstack-0.19.11rc2.dist-info → dstack-0.19.12.dist-info}/RECORD +31 -32
- dstack/_internal/utils/ignore.py +0 -92
- {dstack-0.19.11rc2.dist-info → dstack-0.19.12.dist-info}/WHEEL +0 -0
- {dstack-0.19.11rc2.dist-info → dstack-0.19.12.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.11rc2.dist-info → dstack-0.19.12.dist-info}/licenses/LICENSE.md +0 -0
@@ -84,6 +84,8 @@ class OfferCommand(APIBaseCommand):
         job_plan = run_plan.job_plans[0]

         if args.format == "json":
+            # FIXME: Should use effective_run_spec from run_plan,
+            # since the spec can be changed by the server and plugins
             output = {
                 "project": run_plan.project_name,
                 "user": run_plan.user,
@@ -3,7 +3,7 @@ import subprocess
 import sys
 import time
 from pathlib import Path
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional, Set

 import gpuhunt
 from pydantic import parse_obj_as
@@ -41,7 +41,7 @@ from dstack._internal.core.models.configurations import (
 )
 from dstack._internal.core.models.repos.base import Repo
 from dstack._internal.core.models.resources import CPUSpec
-from dstack._internal.core.models.runs import
+from dstack._internal.core.models.runs import JobStatus, JobSubmission, RunStatus
 from dstack._internal.core.services.configs import ConfigManager
 from dstack._internal.core.services.diff import diff_models
 from dstack._internal.utils.common import local_time
@@ -105,7 +105,7 @@ class BaseRunConfigurator(ApplyEnvVarsConfiguratorMixin, BaseApplyConfigurator):
         changed_fields = []
         if run_plan.action == ApplyAction.UPDATE:
             diff = diff_models(
-                run_plan.
+                run_plan.get_effective_run_spec().configuration,
                 run_plan.current_resource.run_spec.configuration,
             )
             changed_fields = list(diff.keys())
@@ -553,35 +553,38 @@ def _print_service_urls(run: Run) -> None:


 def print_finished_message(run: Run):
+    status_message = (
+        run._run.latest_job_submission.status_message
+        if run._run.latest_job_submission
+        else run._run.status_message
+    )
+    error = (
+        run._run.latest_job_submission.error if run._run.latest_job_submission else run._run.error
+    )
+    termination_reason = (
+        run._run.latest_job_submission.termination_reason
+        if run._run.latest_job_submission
+        else None
+    )
+    termination_reason_message = (
+        run._run.latest_job_submission.termination_reason_message
+        if run._run.latest_job_submission
+        else None
+    )
     if run.status == RunStatus.DONE:
-        console.print("[code]
+        console.print(f"[code]{status_message.capitalize()}[/code]")
         return
+    else:
+        str = f"[error]{status_message.capitalize()}[/error]"
+        if error:
+            str += f" ([error]{error.capitalize()}[/error])"
+        console.print(str)

-
-
-
-
-
-    message = "Run terminated due to unknown reason. Check CLI, server, and run logs."
-
-    if termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
-        message = (
-            "All provisioning attempts failed. "
-            "This is likely due to cloud providers not having enough capacity. "
-            "Check CLI and server logs for more details."
-        )
-    elif termination_reason is not None:
-        exit_status_details = f"Exit status: {exit_status}.\n" if exit_status else ""
-        error_details = (
-            f"Error: {termination_reason_message}\n" if termination_reason_message else ""
-        )
-        message = (
-            f"Run failed with error code {termination_reason.name}.\n"
-            f"{exit_status_details}"
-            f"{error_details}"
-            f"Check [bold]dstack logs -d {run.name}[/bold] for more details."
-        )
-    console.print(f"[error]{message}[/]")
+    if termination_reason_message:
+        console.print(f"[error]{termination_reason_message}[/error]")
+
+    if termination_reason:
+        console.print(f"Check [code]dstack logs -d {run.name}[/code] for more details.")


 def get_run_exit_code(run: Run) -> int:
@@ -590,19 +593,17 @@ def get_run_exit_code(run: Run) -> int:
         return 1


-def
-
-
-
-
-
-
-
-
-
-        job_submission.termination_reason_message,
-        job_submission.exit_status,
+def _is_ready_to_attach(run: Run) -> bool:
+    return not (
+        run.status
+        in [
+            RunStatus.SUBMITTED,
+            RunStatus.PENDING,
+            RunStatus.PROVISIONING,
+            RunStatus.TERMINATING,
+        ]
+        or run._run.jobs[0].job_submissions[-1].status
+        in [JobStatus.SUBMITTED, JobStatus.PROVISIONING, JobStatus.PULLING]
     )


@@ -12,7 +12,6 @@ from dstack._internal.core.models.profiles import (
     TerminationPolicy,
 )
 from dstack._internal.core.models.runs import (
-    Job,
     RunPlan,
 )
 from dstack._internal.core.services.profiles import get_termination
@@ -154,8 +153,7 @@ def get_runs_table(
     table.add_column("BACKEND", style="grey58", ratio=2)
     table.add_column("RESOURCES", ratio=3 if not verbose else 2)
     if verbose:
-        table.add_column("INSTANCE", no_wrap=True, ratio=1)
-        table.add_column("RESERVATION", no_wrap=True, ratio=1)
+        table.add_column("INSTANCE TYPE", no_wrap=True, ratio=1)
     table.add_column("PRICE", style="grey58", ratio=1)
     table.add_column("STATUS", no_wrap=True, ratio=1)
     table.add_column("SUBMITTED", style="grey58", no_wrap=True, ratio=1)
@@ -163,14 +161,14 @@ def get_runs_table(
         table.add_column("ERROR", no_wrap=True, ratio=2)

     for run in runs:
-        run_error = _get_run_error(run)
         run = run._run  # TODO(egor-s): make public attribute

         run_row: Dict[Union[str, int], Any] = {
             "NAME": run.run_spec.run_name,
             "SUBMITTED": format_date(run.submitted_at),
-            "ERROR": run_error,
         }
+        if run.error:
+            run_row["ERROR"] = run.error
         if len(run.jobs) != 1:
             run_row["STATUS"] = run.status
             add_row_from_dict(table, run_row)
@@ -183,25 +181,26 @@ def get_runs_table(
                 status += f" (inactive for {inactive_for})"
             job_row: Dict[Union[str, int], Any] = {
                 "NAME": f" replica={job.job_spec.replica_num} job={job.job_spec.job_num}",
-                "STATUS":
+                "STATUS": latest_job_submission.status_message,
                 "SUBMITTED": format_date(latest_job_submission.submitted_at),
-                "ERROR":
+                "ERROR": latest_job_submission.error,
             }
             jpd = latest_job_submission.job_provisioning_data
             if jpd is not None:
                 resources = jpd.instance_type.resources
-
+                instance_type = jpd.instance_type.name
                 jrd = latest_job_submission.job_runtime_data
                 if jrd is not None and jrd.offer is not None:
                     resources = jrd.offer.instance.resources
                     if jrd.offer.total_blocks > 1:
-
+                        instance_type += f" ({jrd.offer.blocks}/{jrd.offer.total_blocks})"
+                if jpd.reservation:
+                    instance_type += f" ({jpd.reservation})"
                 job_row.update(
                     {
                         "BACKEND": f"{jpd.backend.value.replace('remote', 'ssh')} ({jpd.region})",
                         "RESOURCES": resources.pretty_format(include_spot=True),
-                        "INSTANCE":
-                        "RESERVATION": jpd.reservation,
+                        "INSTANCE TYPE": instance_type,
                         "PRICE": f"${jpd.price:.4f}".rstrip("0").rstrip("."),
                     }
                 )
@@ -211,18 +210,3 @@ def get_runs_table(
             add_row_from_dict(table, job_row, style="secondary" if len(run.jobs) != 1 else None)

     return table
-
-
-def _get_run_error(run: Run) -> str:
-    return run._run.error or ""
-
-
-def _get_job_error(job: Job) -> str:
-    job_submission = job.job_submissions[-1]
-    termination_reason = job_submission.termination_reason
-    exit_status = job_submission.exit_status
-    if termination_reason is None:
-        return ""
-    if exit_status:
-        return f"{termination_reason.name} {exit_status}"
-    return termination_reason.name
@@ -19,9 +19,6 @@ from dstack._internal.core.models.backends.base import (
     BackendType,
 )

-# TODO: Add all supported regions and default regions
-REGIONS = []
-

 class {{ backend_name }}Configurator(Configurator):
     TYPE = BackendType.{{ backend_name|upper }}
@@ -31,13 +28,11 @@ class {{ backend_name }}Configurator(Configurator):
         self, config: {{ backend_name }}BackendConfigWithCreds, default_creds_enabled: bool
     ):
         self._validate_creds(config.creds)
-        # TODO:
+        # TODO: If possible, validate config.regions and any other config parameters

     def create_backend(
         self, project_name: str, config: {{ backend_name }}BackendConfigWithCreds
     ) -> BackendRecord:
-        if config.regions is None:
-            config.regions = REGIONS
         return BackendRecord(
             config={{ backend_name }}StoredConfig(
                 **{{ backend_name }}BackendConfig.__response__.parse_obj(config).dict()
@@ -22,6 +22,7 @@ class {{ backend_name }}BackendConfig(CoreModel):
     It also serves as a base class for other backend config models.
     Should not include creds.
     """
+
     type: Annotated[
         Literal["{{ backend_name|lower }}"],
         Field(description="The type of backend"),
@@ -37,6 +38,7 @@ class {{ backend_name }}BackendConfigWithCreds({{ backend_name }}BackendConfig):
     """
     Same as `{{ backend_name }}BackendConfig` but also includes creds.
     """
+
     creds: Annotated[Any{{ backend_name }}Creds, Field(description="The credentials")]


@@ -48,6 +50,7 @@ class {{ backend_name }}StoredConfig({{ backend_name }}BackendConfig):
     The backend config used for config parameters in the DB.
     Can extend `{{ backend_name }}BackendConfig` with additional parameters.
     """
+
     pass


@@ -55,4 +58,5 @@ class {{ backend_name }}Config({{ backend_name }}StoredConfig):
     """
     The backend config used by `{{ backend_name }}Backend` and `{{ backend_name }}Compute`.
     """
+
     creds: Any{{ backend_name }}Creds
@@ -440,7 +440,7 @@ class ServiceConfigurationParams(CoreModel):
             raise ValueError("The minimum number of replicas must be greater than or equal to 0")
         if v.max < v.min:
             raise ValueError(
-                "The maximum number of replicas must be greater than or equal to the
+                "The maximum number of replicas must be greater than or equal to the minimum number of replicas"
             )
         return v

@@ -20,6 +20,7 @@ from dstack._internal.core.models.profiles import (
     parse_idle_duration,
 )
 from dstack._internal.core.models.resources import Range, ResourcesSpec
+from dstack._internal.utils.common import list_enum_values_for_annotation
 from dstack._internal.utils.json_schema import add_extra_schema_types
 from dstack._internal.utils.tags import tags_validator

@@ -207,7 +208,11 @@ class InstanceGroupParams(CoreModel):
     spot_policy: Annotated[
         Optional[SpotPolicy],
         Field(
-            description=
+            description=(
+                "The policy for provisioning spot or on-demand instances:"
+                f" {list_enum_values_for_annotation(SpotPolicy)}."
+                f" Defaults to `{SpotPolicy.ONDEMAND.value}`"
+            )
         ),
     ] = None
     retry: Annotated[
@@ -6,6 +6,7 @@ from typing_extensions import Annotated, Literal

 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel, Duration
+from dstack._internal.utils.common import list_enum_values_for_annotation
 from dstack._internal.utils.tags import tags_validator

 DEFAULT_RETRY_DURATION = 3600
@@ -32,6 +33,17 @@ class TerminationPolicy(str, Enum):
     DESTROY_AFTER_IDLE = "destroy-after-idle"


+class StartupOrder(str, Enum):
+    ANY = "any"
+    MASTER_FIRST = "master-first"
+    WORKERS_FIRST = "workers-first"
+
+
+class StopCriteria(str, Enum):
+    ALL_DONE = "all-done"
+    MASTER_DONE = "master-done"
+
+
 @overload
 def parse_duration(v: None) -> None: ...

@@ -102,7 +114,7 @@ class ProfileRetry(CoreModel):
         Field(
             description=(
                 "The list of events that should be handled with retry."
-                " Supported events are
+                f" Supported events are {list_enum_values_for_annotation(RetryEvent)}."
                 " Omit to retry on all events"
             )
         ),
@@ -190,7 +202,11 @@ class ProfileParams(CoreModel):
     spot_policy: Annotated[
         Optional[SpotPolicy],
         Field(
-            description=
+            description=(
+                "The policy for provisioning spot or on-demand instances:"
+                f" {list_enum_values_for_annotation(SpotPolicy)}."
+                f" Defaults to `{SpotPolicy.ONDEMAND.value}`"
+            )
         ),
     ] = None
     retry: Annotated[
@@ -225,7 +241,11 @@ class ProfileParams(CoreModel):
     creation_policy: Annotated[
         Optional[CreationPolicy],
         Field(
-            description=
+            description=(
+                "The policy for using instances from fleets:"
+                f" {list_enum_values_for_annotation(CreationPolicy)}."
+                f" Defaults to `{CreationPolicy.REUSE_OR_CREATE.value}`"
+            )
         ),
     ] = None
     idle_duration: Annotated[
@@ -241,6 +261,26 @@ class ProfileParams(CoreModel):
         Optional[UtilizationPolicy],
         Field(description="Run termination policy based on utilization"),
     ] = None
+    startup_order: Annotated[
+        Optional[StartupOrder],
+        Field(
+            description=(
+                f"The order in which master and workers jobs are started:"
+                f" {list_enum_values_for_annotation(StartupOrder)}."
+                f" Defaults to `{StartupOrder.ANY.value}`"
+            )
+        ),
+    ] = None
+    stop_criteria: Annotated[
+        Optional[StopCriteria],
+        Field(
+            description=(
+                "The criteria determining when a multi-node run should be considered finished:"
+                f" {list_enum_values_for_annotation(StopCriteria)}."
+                f" Defaults to `{StopCriteria.ALL_DONE.value}`"
+            )
+        ),
+    ] = None
     fleets: Annotated[
         Optional[list[str]], Field(description="The fleets considered for reuse")
     ] = None
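For context, the two new enums surface as the `startup_order` and `stop_criteria` fields added to `ProfileParams` above. A minimal sketch of setting them programmatically, assuming the remaining `ProfileParams` fields are optional like the ones shown in this diff:

    # Hypothetical usage of the new fields; ProfileParams, StartupOrder, and
    # StopCriteria are the names used in the profiles.py hunks above.
    from dstack._internal.core.models.profiles import (
        ProfileParams,
        StartupOrder,
        StopCriteria,
    )

    params = ProfileParams(
        startup_order=StartupOrder.WORKERS_FIRST,  # start worker jobs before the master job
        stop_criteria=StopCriteria.MASTER_DONE,  # run is finished once the master job is done
    )
    print(params.startup_order, params.stop_criteria)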
@@ -2,13 +2,18 @@ import tarfile
 from pathlib import Path
 from typing import BinaryIO, Optional

+import ignore
+import ignore.overrides
 from typing_extensions import Literal

 from dstack._internal.core.models.repos.base import BaseRepoInfo, Repo
+from dstack._internal.utils.common import sizeof_fmt
 from dstack._internal.utils.hash import get_sha256, slugify
-from dstack._internal.utils.
+from dstack._internal.utils.logging import get_logger
 from dstack._internal.utils.path import PathLike

+logger = get_logger(__name__)
+

 class LocalRepoInfo(BaseRepoInfo):
     repo_type: Literal["local"] = "local"
@@ -69,22 +74,23 @@ class LocalRepo(Repo):
         self.run_repo_data = repo_data

     def write_code_file(self, fp: BinaryIO) -> str:
+        repo_path = Path(self.run_repo_data.repo_dir)
         with tarfile.TarFile(mode="w", fileobj=fp) as t:
-
-
-
-
-
+            for entry in (
+                ignore.WalkBuilder(repo_path)
+                .overrides(ignore.overrides.OverrideBuilder(repo_path).add("!/.git/").build())
+                .hidden(False)  # do not ignore files that start with a dot
+                .require_git(False)  # respect git ignore rules even if not a git repo
+                .add_custom_ignore_filename(".dstackignore")
+                .build()
+            ):
+                entry_path_within_repo = entry.path().relative_to(repo_path)
+                if entry_path_within_repo != Path("."):
+                    t.add(entry.path(), arcname=entry_path_within_repo, recursive=False)
+        logger.debug("Code file size: %s", sizeof_fmt(fp.tell()))
         return get_sha256(fp)

     def get_repo_info(self) -> LocalRepoInfo:
         return LocalRepoInfo(
             repo_dir=self.run_repo_data.repo_dir,
         )
-
-
-class TarIgnore(GitIgnore):
-    def __call__(self, tarinfo: tarfile.TarInfo) -> Optional[tarfile.TarInfo]:
-        if self.ignore(tarinfo.path):
-            return None
-        return tarinfo
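A standalone sketch of the directory walk used by the new `write_code_file` above, assuming the third-party `ignore` package exposes exactly the builder calls shown in the hunk; it prints the repo-relative paths that would be packed into the code archive:

    from pathlib import Path

    import ignore
    import ignore.overrides

    repo_path = Path(".")  # assumption: run from the repo root
    walk = (
        ignore.WalkBuilder(repo_path)
        .overrides(ignore.overrides.OverrideBuilder(repo_path).add("!/.git/").build())
        .hidden(False)  # include files that start with a dot
        .require_git(False)  # apply ignore rules even outside a git repo
        .add_custom_ignore_filename(".dstackignore")
        .build()
    )
    for entry in walk:
        entry_path_within_repo = entry.path().relative_to(repo_path)
        if entry_path_within_repo != Path("."):
            print(entry_path_within_repo)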
@@ -148,9 +148,6 @@ class JobTerminationReason(str, Enum):
         }
         return mapping[self]

-    def pretty_repr(self) -> str:
-        return " ".join(self.value.split("_")).capitalize()
-

 class Requirements(CoreModel):
     # TODO: Make requirements' fields required
@@ -289,6 +286,9 @@ class JobSubmission(CoreModel):
     exit_status: Optional[int]
     job_provisioning_data: Optional[JobProvisioningData]
     job_runtime_data: Optional[JobRuntimeData]
+    # TODO: make status_message and error a computed field after migrating to pydanticV2
+    status_message: Optional[str]
+    error: Optional[str] = None

     @property
     def age(self) -> timedelta:
@@ -301,6 +301,71 @@ class JobSubmission(CoreModel):
         end_time = self.finished_at
         return end_time - self.submitted_at

+    @root_validator
+    def _status_message(cls, values) -> Dict:
+        try:
+            status = values["status"]
+            termination_reason = values["termination_reason"]
+            exit_code = values["exit_status"]
+        except KeyError:
+            return values
+        values["status_message"] = JobSubmission._get_status_message(
+            status=status,
+            termination_reason=termination_reason,
+            exit_status=exit_code,
+        )
+        return values
+
+    @staticmethod
+    def _get_status_message(
+        status: JobStatus,
+        termination_reason: Optional[JobTerminationReason],
+        exit_status: Optional[int],
+    ) -> str:
+        if status == JobStatus.DONE:
+            return "exited (0)"
+        elif status == JobStatus.FAILED:
+            if termination_reason == JobTerminationReason.CONTAINER_EXITED_WITH_ERROR:
+                return f"exited ({exit_status})"
+            elif termination_reason == JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY:
+                return "no offers"
+            elif termination_reason == JobTerminationReason.INTERRUPTED_BY_NO_CAPACITY:
+                return "interrupted"
+            else:
+                return "error"
+        elif status == JobStatus.TERMINATED:
+            if termination_reason == JobTerminationReason.TERMINATED_BY_USER:
+                return "stopped"
+            elif termination_reason == JobTerminationReason.ABORTED_BY_USER:
+                return "aborted"
+        return status.value
+
+    @root_validator
+    def _error(cls, values) -> Dict:
+        try:
+            termination_reason = values["termination_reason"]
+        except KeyError:
+            return values
+        values["error"] = JobSubmission._get_error(termination_reason=termination_reason)
+        return values
+
+    @staticmethod
+    def _get_error(termination_reason: Optional[JobTerminationReason]) -> Optional[str]:
+        error_mapping = {
+            JobTerminationReason.INSTANCE_UNREACHABLE: "instance unreachable",
+            JobTerminationReason.WAITING_INSTANCE_LIMIT_EXCEEDED: "waiting instance limit exceeded",
+            JobTerminationReason.VOLUME_ERROR: "waiting runner limit exceeded",
+            JobTerminationReason.GATEWAY_ERROR: "gateway error",
+            JobTerminationReason.SCALED_DOWN: "scaled down",
+            JobTerminationReason.INACTIVITY_DURATION_EXCEEDED: "inactivity duration exceeded",
+            JobTerminationReason.TERMINATED_DUE_TO_UTILIZATION_POLICY: "utilization policy",
+            JobTerminationReason.PORTS_BINDING_FAILED: "ports binding failed",
+            JobTerminationReason.CREATING_CONTAINER_ERROR: "runner error",
+            JobTerminationReason.EXECUTOR_ERROR: "executor error",
+            JobTerminationReason.MAX_DURATION_EXCEEDED: "max duration exceeded",
+        }
+        return error_mapping.get(termination_reason)
+

 class Job(CoreModel):
     job_spec: JobSpec
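A few illustrative checks (not from the package's test suite) of the new status-message mapping, assuming the private `_get_status_message` helper keeps the signature shown above:

    from dstack._internal.core.models.runs import (
        JobStatus,
        JobSubmission,
        JobTerminationReason,
    )

    # Successful jobs report a zero exit.
    assert (
        JobSubmission._get_status_message(
            status=JobStatus.DONE, termination_reason=None, exit_status=None
        )
        == "exited (0)"
    )
    # Failed provisioning maps to the short "no offers" message.
    assert (
        JobSubmission._get_status_message(
            status=JobStatus.FAILED,
            termination_reason=JobTerminationReason.FAILED_TO_START_DUE_TO_NO_CAPACITY,
            exit_status=None,
        )
        == "no offers"
    )
    # A user-initiated stop is reported as "stopped".
    assert (
        JobSubmission._get_status_message(
            status=JobStatus.TERMINATED,
            termination_reason=JobTerminationReason.TERMINATED_BY_USER,
            exit_status=None,
        )
        == "stopped"
    )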
@@ -445,15 +510,20 @@ class Run(CoreModel):
     def _error(cls, values) -> Dict:
         try:
             termination_reason = values["termination_reason"]
-            jobs = values["jobs"]
         except KeyError:
             return values
-        values["error"] =
-            run_termination_reason=termination_reason,
-            run_jobs=jobs,
-        )
+        values["error"] = Run._get_error(termination_reason=termination_reason)
         return values

+    @staticmethod
+    def _get_error(termination_reason: Optional[RunTerminationReason]) -> Optional[str]:
+        if termination_reason == RunTerminationReason.RETRY_LIMIT_EXCEEDED:
+            return "retry limit exceeded"
+        elif termination_reason == RunTerminationReason.SERVER_ERROR:
+            return "server error"
+        else:
+            return None
+

 class JobPlan(CoreModel):
     job_spec: JobSpec
@@ -502,40 +572,3 @@ def get_policy_map(spot_policy: Optional[SpotPolicy], default: SpotPolicy) -> Op
         SpotPolicy.ONDEMAND: False,
     }
     return policy_map[spot_policy]
-
-
-def _get_run_error(
-    run_termination_reason: Optional[RunTerminationReason],
-    run_jobs: List[Job],
-) -> str:
-    if run_termination_reason is None:
-        return ""
-    if len(run_jobs) > 1:
-        return run_termination_reason.name
-    run_job_termination_reason, exit_status = _get_run_job_termination_reason_and_exit_status(
-        run_jobs
-    )
-    # For failed runs, also show termination reason to provide more context.
-    # For other run statuses, the job termination reason will duplicate run status.
-    if run_job_termination_reason is not None and run_termination_reason in [
-        RunTerminationReason.JOB_FAILED,
-        RunTerminationReason.SERVER_ERROR,
-        RunTerminationReason.RETRY_LIMIT_EXCEEDED,
-    ]:
-        if exit_status:
-            return (
-                f"{run_termination_reason.name}\n({run_job_termination_reason.name} {exit_status})"
-            )
-        return f"{run_termination_reason.name}\n({run_job_termination_reason.name})"
-    return run_termination_reason.name
-
-
-def _get_run_job_termination_reason_and_exit_status(
-    run_jobs: List[Job],
-) -> tuple[Optional[JobTerminationReason], Optional[int]]:
-    for job in run_jobs:
-        if len(job.job_submissions) > 0:
-            job_submission = job.job_submissions[-1]
-            if job_submission.termination_reason is not None:
-                return job_submission.termination_reason, job_submission.exit_status
-    return None, None