dstack-0.19.7-py3-none-any.whl → dstack-0.19.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/services/args.py +2 -2
- dstack/_internal/cli/services/configurators/run.py +56 -13
- dstack/_internal/cli/utils/run.py +10 -5
- dstack/_internal/core/backends/aws/compute.py +13 -1
- dstack/_internal/core/backends/azure/compute.py +42 -13
- dstack/_internal/core/backends/azure/configurator.py +21 -0
- dstack/_internal/core/backends/azure/models.py +9 -0
- dstack/_internal/core/backends/base/compute.py +101 -27
- dstack/_internal/core/backends/base/offers.py +13 -3
- dstack/_internal/core/backends/cudo/compute.py +3 -1
- dstack/_internal/core/backends/datacrunch/compute.py +2 -0
- dstack/_internal/core/backends/gcp/auth.py +1 -1
- dstack/_internal/core/backends/gcp/compute.py +51 -35
- dstack/_internal/core/backends/lambdalabs/compute.py +20 -8
- dstack/_internal/core/backends/local/compute.py +2 -0
- dstack/_internal/core/backends/nebius/compute.py +95 -1
- dstack/_internal/core/backends/nebius/configurator.py +11 -0
- dstack/_internal/core/backends/nebius/fabrics.py +48 -0
- dstack/_internal/core/backends/nebius/models.py +9 -1
- dstack/_internal/core/backends/nebius/resources.py +29 -0
- dstack/_internal/core/backends/oci/compute.py +2 -0
- dstack/_internal/core/backends/remote/provisioning.py +27 -2
- dstack/_internal/core/backends/template/compute.py.jinja +2 -0
- dstack/_internal/core/backends/tensordock/compute.py +2 -0
- dstack/_internal/core/backends/vultr/compute.py +5 -1
- dstack/_internal/core/models/instances.py +2 -1
- dstack/_internal/core/models/resources.py +79 -4
- dstack/_internal/core/models/runs.py +26 -9
- dstack/_internal/core/models/volumes.py +1 -1
- dstack/_internal/server/background/tasks/process_fleets.py +4 -13
- dstack/_internal/server/background/tasks/process_instances.py +176 -55
- dstack/_internal/server/background/tasks/process_metrics.py +26 -9
- dstack/_internal/server/background/tasks/process_placement_groups.py +1 -1
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +5 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +56 -18
- dstack/_internal/server/migrations/versions/20166748b60c_add_jobmodel_disconnected_at.py +100 -0
- dstack/_internal/server/migrations/versions/6c1a9d6530ee_add_jobmodel_exit_status.py +26 -0
- dstack/_internal/server/models.py +6 -1
- dstack/_internal/server/schemas/runner.py +41 -8
- dstack/_internal/server/services/fleets.py +9 -26
- dstack/_internal/server/services/instances.py +0 -2
- dstack/_internal/server/services/jobs/__init__.py +1 -0
- dstack/_internal/server/services/offers.py +15 -0
- dstack/_internal/server/services/placement.py +27 -6
- dstack/_internal/server/services/resources.py +21 -0
- dstack/_internal/server/services/runner/client.py +7 -4
- dstack/_internal/server/services/runs.py +18 -8
- dstack/_internal/server/settings.py +20 -1
- dstack/_internal/server/testing/common.py +37 -26
- dstack/_internal/utils/common.py +13 -1
- dstack/_internal/utils/json_schema.py +6 -3
- dstack/api/__init__.py +1 -0
- dstack/api/server/_fleets.py +16 -0
- dstack/api/server/_runs.py +48 -3
- dstack/version.py +1 -1
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/METADATA +38 -29
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/RECORD +60 -56
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/WHEEL +0 -0
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.7.dist-info → dstack-0.19.9.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/core/backends/remote/provisioning.py
@@ -6,8 +6,9 @@ from textwrap import dedent
 from typing import Any, Dict, Generator, List, Optional
 
 import paramiko
-from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib
+from gpuhunt import AcceleratorVendor, CPUArchitecture, correct_gpu_memory_gib
 
+from dstack._internal.core.backends.base.compute import GoArchType, normalize_arch
 from dstack._internal.core.consts import DSTACK_SHIM_HTTP_PORT
 
 # FIXME: ProvisioningError is a subclass of ComputeError and should not be used outside of Compute
@@ -36,6 +37,22 @@ DSTACK_SHIM_ENV_FILE = "shim.env"
 HOST_INFO_FILE = "host_info.json"
 
 
+def detect_cpu_arch(client: paramiko.SSHClient) -> GoArchType:
+    cmd = "uname -m"
+    try:
+        _, stdout, stderr = client.exec_command(cmd, timeout=20)
+    except (paramiko.SSHException, OSError) as e:
+        raise ProvisioningError(f"detect_cpu_arch: {e}") from e
+    out = stdout.read().strip().decode()
+    err = stderr.read().strip().decode()
+    if err:
+        raise ProvisioningError(f"detect_cpu_arch: {cmd} failed, stdout: {out}, stderr: {err}")
+    try:
+        return normalize_arch(out)
+    except ValueError as e:
+        raise ProvisioningError(f"detect_cpu_arch: failed to normalize arch: {e}") from e
+
+
 def sftp_upload(client: paramiko.SSHClient, path: str, body: str) -> None:
     try:
         sftp = client.open_sftp()
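Note: detect_cpu_arch delegates to normalize_arch from dstack._internal.core.backends.base.compute (+101 -27 above), whose body is not part of this diff. A minimal sketch of such a normalizer, assuming the usual `uname -m` aliases (the exact alias table dstack handles may differ):

from typing import Dict, Literal

# GoArchType as suggested by the import above; the alias table is an assumption.
GoArchType = Literal["amd64", "arm64"]

_ALIASES: Dict[str, GoArchType] = {
    "x86_64": "amd64",   # common `uname -m` output on Intel/AMD hosts
    "amd64": "amd64",
    "aarch64": "arm64",  # common `uname -m` output on ARM hosts
    "arm64": "arm64",
}


def normalize_arch(arch: str) -> GoArchType:
    normalized = _ALIASES.get(arch.strip().lower())
    if normalized is None:
        raise ValueError(f"Unsupported architecture: {arch!r}")
    return normalized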
@@ -226,7 +243,14 @@ def get_shim_healthcheck(client: paramiko.SSHClient) -> str:
         raise ProvisioningError(f"get_shim_healthcheck failed: {e}") from e
 
 
-def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
+def host_info_to_instance_type(host_info: Dict[str, Any], cpu_arch: GoArchType) -> InstanceType:
+    _cpu_arch: CPUArchitecture
+    if cpu_arch == "amd64":
+        _cpu_arch = CPUArchitecture.X86
+    elif cpu_arch == "arm64":
+        _cpu_arch = CPUArchitecture.ARM
+    else:
+        raise ValueError(f"Unexpected cpu_arch: {cpu_arch}")
     gpu_count = host_info.get("gpu_count", 0)
     if gpu_count > 0:
         gpu_vendor = AcceleratorVendor.cast(host_info.get("gpu_vendor", "nvidia"))
@@ -251,6 +275,7 @@ def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
     instance_type = InstanceType(
         name="instance",
         resources=Resources(
+            cpu_arch=_cpu_arch,
             cpus=host_info["cpus"],
             memory_mib=host_info["memory"] / 1024 / 1024,
             spot=False,
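Note: together, the remote-provisioning changes detect the architecture first, then thread it into the instance type. A usage sketch, assuming an SSH-reachable host and a host_info dict carrying only the fields visible in this diff (the real dict comes from host_info.json and may carry more):

import paramiko

from dstack._internal.core.backends.remote.provisioning import (
    detect_cpu_arch,
    host_info_to_instance_type,
)

client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect("203.0.113.10", username="ubuntu")  # illustrative host
try:
    cpu_arch = detect_cpu_arch(client)  # "amd64" or "arm64", via `uname -m`
    host_info = {"cpus": 8, "memory": 32 * 1024**3, "gpu_count": 0}
    instance_type = host_info_to_instance_type(host_info, cpu_arch)
    print(instance_type.resources.pretty_format())
finally:
    client.close()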
dstack/_internal/core/backends/template/compute.py.jinja
@@ -18,6 +18,7 @@ from dstack._internal.core.models.instances import (
     InstanceConfiguration,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.runs import Job, JobProvisioningData, Requirements, Run
 from dstack._internal.core.models.volumes import Volume
 from dstack._internal.utils.logging import get_logger
@@ -64,6 +65,7 @@ class {{ backend_name }}Compute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         # TODO: Implement if backend supports creating instances (VM-based).
         # Delete if backend can only run jobs (container-based).
dstack/_internal/core/backends/tensordock/compute.py
@@ -19,6 +19,7 @@ from dstack._internal.core.models.instances import (
     InstanceConfiguration,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.utils.logging import get_logger
 
@@ -57,6 +58,7 @@ class TensorDockCompute(
         self,
         instance_offer: InstanceOfferWithAvailability,
         instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name(
             instance_config, max_length=MAX_INSTANCE_NAME_LEN
dstack/_internal/core/backends/vultr/compute.py
@@ -22,6 +22,7 @@ from dstack._internal.core.models.instances import (
     InstanceOffer,
     InstanceOfferWithAvailability,
 )
+from dstack._internal.core.models.placement import PlacementGroup
 from dstack._internal.core.models.runs import JobProvisioningData, Requirements
 from dstack._internal.utils.logging import get_logger
 
@@ -58,7 +59,10 @@ class VultrCompute(
         return offers
 
     def create_instance(
-        self, instance_offer: InstanceOfferWithAvailability, instance_config: InstanceConfiguration
+        self,
+        instance_offer: InstanceOfferWithAvailability,
+        instance_config: InstanceConfiguration,
+        placement_group: Optional[PlacementGroup],
     ) -> JobProvisioningData:
         instance_name = generate_unique_instance_name(
             instance_config, max_length=MAX_INSTANCE_NAME_LEN
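Note: VM-based backends now receive the placement group as an explicit create_instance argument; the placement_group_name field is removed from InstanceConfiguration in the instances.py hunk below. A minimal conforming backend sketch (everything except the signature is illustrative):

from typing import Optional

from dstack._internal.core.models.instances import (
    InstanceConfiguration,
    InstanceOfferWithAvailability,
)
from dstack._internal.core.models.placement import PlacementGroup
from dstack._internal.core.models.runs import JobProvisioningData


class ExampleCompute:  # real backends derive from the base Compute classes
    def create_instance(
        self,
        instance_offer: InstanceOfferWithAvailability,
        instance_config: InstanceConfiguration,
        placement_group: Optional[PlacementGroup],
    ) -> JobProvisioningData:
        # Pass the group to the cloud API when the backend supports it
        # (attribute access is illustrative).
        group_name = placement_group.name if placement_group is not None else None
        raise NotImplementedError(f"provision VM in group {group_name!r}")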
dstack/_internal/core/models/instances.py
@@ -49,11 +49,13 @@ class Resources(CoreModel):
     spot: bool
     disk: Disk = Disk(size_mib=102400)  # the default value (100GB) for backward compatibility
     description: str = ""
+    cpu_arch: Optional[gpuhunt.CPUArchitecture] = None
 
     def pretty_format(self, include_spot: bool = False) -> str:
         resources = {}
         if self.cpus > 0:
             resources["cpus"] = self.cpus
+            resources["cpu_arch"] = self.cpu_arch
         if self.memory_mib > 0:
             resources["memory"] = f"{self.memory_mib / 1024:.0f}GB"
         if self.disk.size_mib > 0:
@@ -105,7 +107,6 @@ class InstanceConfiguration(CoreModel):
     user: str  # dstack user name
     ssh_keys: List[SSHKey]
     instance_id: Optional[str] = None
-    placement_group_name: Optional[str] = None
     reservation: Optional[str] = None
     volumes: Optional[List[Volume]] = None
     tags: Optional[Dict[str, str]] = None
dstack/_internal/core/models/resources.py
@@ -1,8 +1,9 @@
 import math
+from collections.abc import Mapping
 from typing import Any, Dict, Generic, List, Optional, Tuple, TypeVar, Union
 
 import gpuhunt
-from pydantic import Field, root_validator, validator
+from pydantic import Field, parse_obj_as, root_validator, validator
 from pydantic.generics import GenericModel
 from typing_extensions import Annotated
 
|
@@ -125,7 +126,68 @@ class ComputeCapability(Tuple[int, int]):
|
|
|
125
126
|
|
|
126
127
|
DEFAULT_CPU_COUNT = Range[int](min=2)
|
|
127
128
|
DEFAULT_MEMORY_SIZE = Range[Memory](min=Memory.parse("8GB"))
|
|
128
|
-
DEFAULT_GPU_COUNT = Range[int](min=1
|
|
129
|
+
DEFAULT_GPU_COUNT = Range[int](min=1)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class CPUSpec(CoreModel):
|
|
133
|
+
class Config:
|
|
134
|
+
@staticmethod
|
|
135
|
+
def schema_extra(schema: Dict[str, Any]):
|
|
136
|
+
add_extra_schema_types(
|
|
137
|
+
schema["properties"]["count"],
|
|
138
|
+
extra_types=[{"type": "integer"}, {"type": "string"}],
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
arch: Annotated[
|
|
142
|
+
Optional[gpuhunt.CPUArchitecture],
|
|
143
|
+
Field(description="The CPU architecture, one of: `x86`, `arm`"),
|
|
144
|
+
] = None
|
|
145
|
+
count: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
|
|
146
|
+
|
|
147
|
+
@classmethod
|
|
148
|
+
def __get_validators__(cls):
|
|
149
|
+
yield cls.parse
|
|
150
|
+
yield cls.validate
|
|
151
|
+
|
|
152
|
+
@classmethod
|
|
153
|
+
def parse(cls, v: Any) -> Any:
|
|
154
|
+
if isinstance(v, int):
|
|
155
|
+
v = str(v)
|
|
156
|
+
if isinstance(v, str):
|
|
157
|
+
tokens = v.replace(" ", "").split(":")
|
|
158
|
+
spec = {}
|
|
159
|
+
for token in tokens:
|
|
160
|
+
if not token:
|
|
161
|
+
raise ValueError(f"CPU spec contains empty token: {v}")
|
|
162
|
+
if ".." in token or token.isdigit():
|
|
163
|
+
if "count" in spec:
|
|
164
|
+
raise ValueError(f"CPU spec count conflict: {v}")
|
|
165
|
+
spec["count"] = token
|
|
166
|
+
else:
|
|
167
|
+
try:
|
|
168
|
+
arch = gpuhunt.CPUArchitecture.cast(token)
|
|
169
|
+
except ValueError:
|
|
170
|
+
raise ValueError(f"Invalid CPU architecture: {v}")
|
|
171
|
+
if "arch" in spec:
|
|
172
|
+
raise ValueError(f"CPU spec arch conflict: {v}")
|
|
173
|
+
spec["arch"] = arch
|
|
174
|
+
return spec
|
|
175
|
+
# Range and min/max dict - for backward compatibility
|
|
176
|
+
if isinstance(v, Range):
|
|
177
|
+
return {"arch": None, "count": v}
|
|
178
|
+
if isinstance(v, Mapping) and v.keys() == {"min", "max"}:
|
|
179
|
+
return {"arch": None, "count": v}
|
|
180
|
+
return v
|
|
181
|
+
|
|
182
|
+
@validator("arch", pre=True)
|
|
183
|
+
def _validate_arch(cls, v: Any) -> Any:
|
|
184
|
+
if v is None:
|
|
185
|
+
return None
|
|
186
|
+
if isinstance(v, gpuhunt.CPUArchitecture):
|
|
187
|
+
return v
|
|
188
|
+
if isinstance(v, str):
|
|
189
|
+
return gpuhunt.CPUArchitecture.cast(v)
|
|
190
|
+
return v
|
|
129
191
|
|
|
130
192
|
|
|
131
193
|
class GPUSpec(CoreModel):
|
|
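Note: CPUSpec.parse accepts colon-separated tokens in any order, one architecture token and one count token (an integer or a min..max range), plus plain ints, Range, and {min, max} dicts for backward compatibility. Illustrative parses, sketched against the code above:

import gpuhunt
from pydantic import parse_obj_as

from dstack._internal.core.models.resources import CPUSpec

# Count only; the architecture stays unset:
spec = parse_obj_as(CPUSpec, "4..16")
assert spec.arch is None
assert spec.count.min == 4 and spec.count.max == 16

# Architecture and count tokens, in either order:
spec = parse_obj_as(CPUSpec, "arm:8")
assert spec.arch == gpuhunt.CPUArchitecture.ARM

# Plain integers keep working (backward compatibility):
spec = parse_obj_as(CPUSpec, 4)
assert spec.arch is None

# Duplicate tokens of the same kind are rejected ("CPU spec arch conflict"):
try:
    parse_obj_as(CPUSpec, "x86:arm")
except ValueError:
    pass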
@@ -302,7 +364,10 @@ class ResourcesSpec(CoreModel):
                 extra_types=[{"type": "integer"}, {"type": "string"}],
             )
 
-    cpu: Annotated[Range[int], Field(description="The number of CPU cores")] = DEFAULT_CPU_COUNT
+    # TODO: Remove Range[int] in 0.20. Range[int] for backward compatibility only.
+    cpu: Annotated[Union[CPUSpec, Range[int]], Field(description="The CPU requirements")] = (
+        CPUSpec()
+    )
     memory: Annotated[Range[Memory], Field(description="The RAM size (e.g., `8GB`)")] = (
         DEFAULT_MEMORY_SIZE
     )
@@ -317,8 +382,18 @@ class ResourcesSpec(CoreModel):
     gpu: Annotated[Optional[GPUSpec], Field(description="The GPU requirements")] = None
     disk: Annotated[Optional[DiskSpec], Field(description="The disk resources")] = DEFAULT_DISK
 
+    # TODO: Remove in 0.20. Added for backward compatibility.
+    @root_validator
+    def _post_validate(cls, values):
+        cpu = values.get("cpu")
+        if isinstance(cpu, CPUSpec) and cpu.arch in [None, gpuhunt.CPUArchitecture.X86]:
+            values["cpu"] = cpu.count
+        return values
+
     def pretty_format(self) -> str:
-        resources = dict(cpus=self.cpu, memory=self.memory)
+        # TODO: Remove in 0.20. Use self.cpu directly
+        cpu = parse_obj_as(CPUSpec, self.cpu)
+        resources: Dict[str, Any] = dict(cpu_arch=cpu.arch, cpus=cpu.count, memory=self.memory)
         if self.gpu:
             gpu = self.gpu
             resources.update(
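Note: the _post_validate root validator above keeps serialized specs compatible with older clients by collapsing a CPUSpec back to its bare count range whenever the architecture is unset or x86. The observable behavior, sketched:

import gpuhunt

from dstack._internal.core.models.resources import CPUSpec, ResourcesSpec

# arch omitted or x86: cpu is collapsed to the bare count range, so older
# clients that expect `cpu` to be a range keep working.
spec = ResourcesSpec(cpu="x86:4")
assert not isinstance(spec.cpu, CPUSpec)

# arm is preserved as a full CPUSpec; older clients could not express it anyway.
spec = ResourcesSpec(cpu="arm:4")
assert isinstance(spec.cpu, CPUSpec)
assert spec.cpu.arch == gpuhunt.CPUArchitecture.ARM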
dstack/_internal/core/models/runs.py
@@ -104,6 +104,7 @@ class JobTerminationReason(str, Enum):
     # Set by the server
     FAILED_TO_START_DUE_TO_NO_CAPACITY = "failed_to_start_due_to_no_capacity"
     INTERRUPTED_BY_NO_CAPACITY = "interrupted_by_no_capacity"
+    INSTANCE_UNREACHABLE = "instance_unreachable"
     WAITING_INSTANCE_LIMIT_EXCEEDED = "waiting_instance_limit_exceeded"
     WAITING_RUNNER_LIMIT_EXCEEDED = "waiting_runner_limit_exceeded"
     TERMINATED_BY_USER = "terminated_by_user"
@@ -126,6 +127,7 @@ class JobTerminationReason(str, Enum):
         mapping = {
             self.FAILED_TO_START_DUE_TO_NO_CAPACITY: JobStatus.FAILED,
             self.INTERRUPTED_BY_NO_CAPACITY: JobStatus.FAILED,
+            self.INSTANCE_UNREACHABLE: JobStatus.FAILED,
             self.WAITING_INSTANCE_LIMIT_EXCEEDED: JobStatus.FAILED,
             self.WAITING_RUNNER_LIMIT_EXCEEDED: JobStatus.FAILED,
             self.TERMINATED_BY_USER: JobStatus.TERMINATED,
@@ -262,9 +264,9 @@ class JobRuntimeData(CoreModel):
     # or not applicable (container-based backends)
     ports: Optional[dict[int, int]] = None
     # List of volumes used by the job
-    volume_names: Optional[list[str]] = None  # None for backward
+    volume_names: Optional[list[str]] = None  # None for backward compatibility
     # Virtual shared offer
-    offer: Optional[InstanceOfferWithAvailability] = None  # None for backward
+    offer: Optional[InstanceOfferWithAvailability] = None  # None for backward compatibility
 
 
 class ClusterInfo(CoreModel):
@@ -283,6 +285,7 @@ class JobSubmission(CoreModel):
     status: JobStatus
     termination_reason: Optional[JobTerminationReason]
    termination_reason_message: Optional[str]
+    exit_status: Optional[int]
     job_provisioning_data: Optional[JobProvisioningData]
     job_runtime_data: Optional[JobRuntimeData]
 
@@ -439,9 +442,14 @@ class Run(CoreModel):
 
     @root_validator
     def _error(cls, values) -> Dict:
+        try:
+            termination_reason = values["termination_reason"]
+            jobs = values["jobs"]
+        except KeyError:
+            return values
         values["error"] = _get_run_error(
-            run_termination_reason=values["termination_reason"],
-            run_jobs=values["jobs"],
+            run_termination_reason=termination_reason,
+            run_jobs=jobs,
         )
         return values
 
@@ -503,7 +511,9 @@ def _get_run_error(
         return ""
     if len(run_jobs) > 1:
         return run_termination_reason.name
-    run_job_termination_reason = _get_run_job_termination_reason(run_jobs)
+    run_job_termination_reason, exit_status = _get_run_job_termination_reason_and_exit_status(
+        run_jobs
+    )
     # For failed runs, also show termination reason to provide more context.
     # For other run statuses, the job termination reason will duplicate run status.
     if run_job_termination_reason is not None and run_termination_reason in [
@@ -511,13 +521,20 @@ def _get_run_error(
         RunTerminationReason.SERVER_ERROR,
         RunTerminationReason.RETRY_LIMIT_EXCEEDED,
     ]:
+        if exit_status:
+            return (
+                f"{run_termination_reason.name}\n({run_job_termination_reason.name} {exit_status})"
+            )
         return f"{run_termination_reason.name}\n({run_job_termination_reason.name})"
     return run_termination_reason.name
 
 
-def _get_run_job_termination_reason(run_jobs: List[Job]) -> Optional[JobTerminationReason]:
+def _get_run_job_termination_reason_and_exit_status(
+    run_jobs: List[Job],
+) -> tuple[Optional[JobTerminationReason], Optional[int]]:
     for job in run_jobs:
         if len(job.job_submissions) > 0:
-            if job.job_submissions[-1].termination_reason is not None:
-                return job.job_submissions[-1].termination_reason
-    return None
+            job_submission = job.job_submissions[-1]
+            if job_submission.termination_reason is not None:
+                return job_submission.termination_reason, job_submission.exit_status
+    return None, None
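Note: with exit_status threaded through, the error string for a failed single-job run can now include the container's exit code. An illustrative rendering of the formatting branch above (the enum members are assumed from dstack's models):

from dstack._internal.core.models.runs import JobTerminationReason, RunTerminationReason

# Illustrative values; a real run takes these from its last job submission.
run_reason = RunTerminationReason.JOB_FAILED
job_reason = JobTerminationReason.CONTAINER_EXITED_WITH_ERROR
exit_status = 137  # e.g. an OOM-killed container

print(f"{run_reason.name}\n({job_reason.name} {exit_status})")
# JOB_FAILED
# (CONTAINER_EXITED_WITH_ERROR 137)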
dstack/_internal/core/models/volumes.py
@@ -159,7 +159,7 @@ class VolumeMountPoint(CoreModel):
         description=(
             "The network volume name or the list of network volume names to mount."
             " If a list is specified, one of the volumes in the list will be mounted."
-            " Specify volumes from different backends/regions to increase availability
+            " Specify volumes from different backends/regions to increase availability"
         )
     ),
 ]
dstack/_internal/server/background/tasks/process_fleets.py
@@ -1,15 +1,16 @@
-from sqlalchemy import select, update
+from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.orm import joinedload
 
 from dstack._internal.core.models.fleets import FleetStatus
 from dstack._internal.server.db import get_session_ctx
-from dstack._internal.server.models import FleetModel, PlacementGroupModel
+from dstack._internal.server.models import FleetModel
 from dstack._internal.server.services.fleets import (
     is_fleet_empty,
     is_fleet_in_use,
 )
 from dstack._internal.server.services.locking import get_locker
+from dstack._internal.server.services.placement import schedule_fleet_placement_groups_deletion
 from dstack._internal.utils.common import get_current_datetime
 from dstack._internal.utils.logging import get_logger
 
@@ -68,16 +69,6 @@ async def _autodelete_fleet(session: AsyncSession, fleet_model: FleetModel):
     fleet_model.status = FleetStatus.TERMINATED
     fleet_model.deleted = True
     fleet_model.last_processed_at = get_current_datetime()
-    await _mark_placement_groups_as_ready_for_deletion(session=session, fleet_model=fleet_model)
+    await schedule_fleet_placement_groups_deletion(session=session, fleet_id=fleet_model.id)
     await session.commit()
     logger.info("Fleet %s deleted", fleet_model.name)
-
-
-async def _mark_placement_groups_as_ready_for_deletion(
-    session: AsyncSession, fleet_model: FleetModel
-):
-    await session.execute(
-        update(PlacementGroupModel)
-        .where(PlacementGroupModel.fleet_id == fleet_model.id)
-        .values(fleet_deleted=True)
-    )