dstack 0.19.21__py3-none-any.whl → 0.19.23rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/apply.py +8 -3
- dstack/_internal/cli/services/configurators/__init__.py +8 -0
- dstack/_internal/cli/services/configurators/fleet.py +1 -1
- dstack/_internal/cli/services/configurators/gateway.py +1 -1
- dstack/_internal/cli/services/configurators/run.py +11 -1
- dstack/_internal/cli/services/configurators/volume.py +1 -1
- dstack/_internal/cli/utils/common.py +48 -5
- dstack/_internal/cli/utils/fleet.py +5 -5
- dstack/_internal/cli/utils/run.py +32 -0
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
- dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
- dstack/_internal/core/backends/hotaisle/backend.py +16 -0
- dstack/_internal/core/backends/hotaisle/compute.py +225 -0
- dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
- dstack/_internal/core/backends/hotaisle/models.py +45 -0
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
- dstack/_internal/core/backends/models.py +8 -0
- dstack/_internal/core/backends/nebius/compute.py +8 -2
- dstack/_internal/core/backends/nebius/fabrics.py +1 -0
- dstack/_internal/core/backends/nebius/resources.py +9 -0
- dstack/_internal/core/compatibility/runs.py +8 -0
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/core/models/configurations.py +139 -1
- dstack/_internal/core/models/health.py +28 -0
- dstack/_internal/core/models/instances.py +2 -0
- dstack/_internal/core/models/logs.py +2 -1
- dstack/_internal/core/models/runs.py +21 -1
- dstack/_internal/core/services/ssh/tunnel.py +7 -0
- dstack/_internal/server/app.py +4 -0
- dstack/_internal/server/background/__init__.py +4 -0
- dstack/_internal/server/background/tasks/process_instances.py +107 -56
- dstack/_internal/server/background/tasks/process_probes.py +164 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +13 -0
- dstack/_internal/server/background/tasks/process_runs.py +21 -14
- dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
- dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
- dstack/_internal/server/models.py +41 -0
- dstack/_internal/server/routers/instances.py +33 -5
- dstack/_internal/server/schemas/health/dcgm.py +56 -0
- dstack/_internal/server/schemas/instances.py +32 -0
- dstack/_internal/server/schemas/runner.py +5 -0
- dstack/_internal/server/services/instances.py +103 -1
- dstack/_internal/server/services/jobs/__init__.py +8 -1
- dstack/_internal/server/services/jobs/configurators/base.py +26 -0
- dstack/_internal/server/services/logging.py +4 -2
- dstack/_internal/server/services/logs/aws.py +13 -1
- dstack/_internal/server/services/logs/gcp.py +16 -1
- dstack/_internal/server/services/probes.py +6 -0
- dstack/_internal/server/services/projects.py +16 -4
- dstack/_internal/server/services/runner/client.py +52 -20
- dstack/_internal/server/services/runner/ssh.py +4 -4
- dstack/_internal/server/services/runs.py +49 -13
- dstack/_internal/server/services/ssh.py +66 -0
- dstack/_internal/server/settings.py +13 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
- dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
- dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
- dstack/_internal/server/testing/common.py +44 -0
- dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
- dstack/_internal/settings.py +3 -0
- dstack/_internal/utils/common.py +15 -0
- dstack/api/server/__init__.py +1 -1
- dstack/version.py +1 -1
- {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/METADATA +14 -14
- {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/RECORD +71 -58
- /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
- {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/WHEEL +0 -0
- {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.backends.base.configurator import (
|
|
4
|
+
BackendRecord,
|
|
5
|
+
Configurator,
|
|
6
|
+
)
|
|
7
|
+
from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient
|
|
8
|
+
from dstack._internal.core.backends.hotaisle.backend import HotAisleBackend
|
|
9
|
+
from dstack._internal.core.backends.hotaisle.models import (
|
|
10
|
+
AnyHotAisleBackendConfig,
|
|
11
|
+
AnyHotAisleCreds,
|
|
12
|
+
HotAisleBackendConfig,
|
|
13
|
+
HotAisleBackendConfigWithCreds,
|
|
14
|
+
HotAisleConfig,
|
|
15
|
+
HotAisleCreds,
|
|
16
|
+
HotAisleStoredConfig,
|
|
17
|
+
)
|
|
18
|
+
from dstack._internal.core.models.backends.base import (
|
|
19
|
+
BackendType,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class HotAisleConfigurator(Configurator):
|
|
24
|
+
TYPE = BackendType.HOTAISLE
|
|
25
|
+
BACKEND_CLASS = HotAisleBackend
|
|
26
|
+
|
|
27
|
+
def validate_config(self, config: HotAisleBackendConfigWithCreds, default_creds_enabled: bool):
|
|
28
|
+
self._validate_creds(config.creds, config.team_handle)
|
|
29
|
+
|
|
30
|
+
def create_backend(
|
|
31
|
+
self, project_name: str, config: HotAisleBackendConfigWithCreds
|
|
32
|
+
) -> BackendRecord:
|
|
33
|
+
return BackendRecord(
|
|
34
|
+
config=HotAisleStoredConfig(
|
|
35
|
+
**HotAisleBackendConfig.__response__.parse_obj(config).dict()
|
|
36
|
+
).json(),
|
|
37
|
+
auth=HotAisleCreds.parse_obj(config.creds).json(),
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
def get_backend_config(
|
|
41
|
+
self, record: BackendRecord, include_creds: bool
|
|
42
|
+
) -> AnyHotAisleBackendConfig:
|
|
43
|
+
config = self._get_config(record)
|
|
44
|
+
if include_creds:
|
|
45
|
+
return HotAisleBackendConfigWithCreds.__response__.parse_obj(config)
|
|
46
|
+
return HotAisleBackendConfig.__response__.parse_obj(config)
|
|
47
|
+
|
|
48
|
+
def get_backend(self, record: BackendRecord) -> HotAisleBackend:
|
|
49
|
+
config = self._get_config(record)
|
|
50
|
+
return HotAisleBackend(config=config)
|
|
51
|
+
|
|
52
|
+
def _get_config(self, record: BackendRecord) -> HotAisleConfig:
|
|
53
|
+
return HotAisleConfig.__response__(
|
|
54
|
+
**json.loads(record.config),
|
|
55
|
+
creds=HotAisleCreds.parse_raw(record.auth),
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
def _validate_creds(self, creds: AnyHotAisleCreds, team_handle: str):
|
|
59
|
+
api_client = HotAisleAPIClient(creds.api_key, team_handle)
|
|
60
|
+
api_client.validate_api_key()
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from typing import Annotated, List, Literal, Optional, Union
|
|
2
|
+
|
|
3
|
+
from pydantic import Field
|
|
4
|
+
|
|
5
|
+
from dstack._internal.core.models.common import CoreModel
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class HotAisleAPIKeyCreds(CoreModel):
|
|
9
|
+
type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key"
|
|
10
|
+
api_key: Annotated[str, Field(description="The Hot Aisle API key")]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
AnyHotAisleCreds = HotAisleAPIKeyCreds
|
|
14
|
+
HotAisleCreds = AnyHotAisleCreds
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class HotAisleBackendConfig(CoreModel):
|
|
18
|
+
type: Annotated[
|
|
19
|
+
Literal["hotaisle"],
|
|
20
|
+
Field(description="The type of backend"),
|
|
21
|
+
] = "hotaisle"
|
|
22
|
+
team_handle: Annotated[str, Field(description="The Hot Aisle team handle")]
|
|
23
|
+
regions: Annotated[
|
|
24
|
+
Optional[List[str]],
|
|
25
|
+
Field(description="The list of Hot Aisle regions. Omit to use all regions"),
|
|
26
|
+
] = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class HotAisleBackendConfigWithCreds(HotAisleBackendConfig):
|
|
30
|
+
creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
AnyHotAisleBackendConfig = Union[HotAisleBackendConfig, HotAisleBackendConfigWithCreds]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class HotAisleBackendFileConfigWithCreds(HotAisleBackendConfig):
|
|
37
|
+
creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class HotAisleStoredConfig(HotAisleBackendConfig):
|
|
41
|
+
pass
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class HotAisleConfig(HotAisleStoredConfig):
|
|
45
|
+
creds: AnyHotAisleCreds
|
|
@@ -206,10 +206,11 @@ def _launch_runner(
|
|
|
206
206
|
ssh_private_key: str,
|
|
207
207
|
launch_command: str,
|
|
208
208
|
):
|
|
209
|
+
daemonized_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown"
|
|
209
210
|
_run_ssh_command(
|
|
210
211
|
hostname=hostname,
|
|
211
212
|
ssh_private_key=ssh_private_key,
|
|
212
|
-
command=launch_command,
|
|
213
|
+
command=daemonized_command,
|
|
213
214
|
)
|
|
214
215
|
|
|
215
216
|
|
|
@@ -29,6 +29,11 @@ from dstack._internal.core.backends.gcp.models import (
|
|
|
29
29
|
GCPBackendConfigWithCreds,
|
|
30
30
|
GCPBackendFileConfigWithCreds,
|
|
31
31
|
)
|
|
32
|
+
from dstack._internal.core.backends.hotaisle.models import (
|
|
33
|
+
HotAisleBackendConfig,
|
|
34
|
+
HotAisleBackendConfigWithCreds,
|
|
35
|
+
HotAisleBackendFileConfigWithCreds,
|
|
36
|
+
)
|
|
32
37
|
from dstack._internal.core.backends.kubernetes.models import (
|
|
33
38
|
KubernetesBackendConfig,
|
|
34
39
|
KubernetesBackendConfigWithCreds,
|
|
@@ -73,6 +78,7 @@ AnyBackendConfigWithoutCreds = Union[
|
|
|
73
78
|
CudoBackendConfig,
|
|
74
79
|
DataCrunchBackendConfig,
|
|
75
80
|
GCPBackendConfig,
|
|
81
|
+
HotAisleBackendConfig,
|
|
76
82
|
KubernetesBackendConfig,
|
|
77
83
|
LambdaBackendConfig,
|
|
78
84
|
NebiusBackendConfig,
|
|
@@ -95,6 +101,7 @@ AnyBackendConfigWithCreds = Union[
|
|
|
95
101
|
CudoBackendConfigWithCreds,
|
|
96
102
|
DataCrunchBackendConfigWithCreds,
|
|
97
103
|
GCPBackendConfigWithCreds,
|
|
104
|
+
HotAisleBackendConfigWithCreds,
|
|
98
105
|
KubernetesBackendConfigWithCreds,
|
|
99
106
|
LambdaBackendConfigWithCreds,
|
|
100
107
|
OCIBackendConfigWithCreds,
|
|
@@ -116,6 +123,7 @@ AnyBackendFileConfigWithCreds = Union[
|
|
|
116
123
|
CudoBackendConfigWithCreds,
|
|
117
124
|
DataCrunchBackendConfigWithCreds,
|
|
118
125
|
GCPBackendFileConfigWithCreds,
|
|
126
|
+
HotAisleBackendFileConfigWithCreds,
|
|
119
127
|
KubernetesBackendFileConfigWithCreds,
|
|
120
128
|
LambdaBackendConfigWithCreds,
|
|
121
129
|
OCIBackendConfigWithCreds,
|
|
@@ -74,6 +74,7 @@ SETUP_COMMANDS = [
|
|
|
74
74
|
SUPPORTED_PLATFORMS = [
|
|
75
75
|
"gpu-h100-sxm",
|
|
76
76
|
"gpu-h200-sxm",
|
|
77
|
+
"gpu-b200-sxm",
|
|
77
78
|
"gpu-l40s-a",
|
|
78
79
|
"gpu-l40s-d",
|
|
79
80
|
"cpu-d3",
|
|
@@ -150,12 +151,16 @@ class NebiusCompute(
|
|
|
150
151
|
)
|
|
151
152
|
if backend_data.cluster is not None:
|
|
152
153
|
cluster_id = backend_data.cluster.id
|
|
154
|
+
|
|
155
|
+
gpus = instance_offer.instance.resources.gpus
|
|
153
156
|
create_disk_op = resources.create_disk(
|
|
154
157
|
sdk=self._sdk,
|
|
155
158
|
name=instance_name,
|
|
156
159
|
project_id=self._region_to_project_id[instance_offer.region],
|
|
157
160
|
size_mib=instance_offer.instance.resources.disk.size_mib,
|
|
158
|
-
image_family="ubuntu22.04-cuda12",
|
|
161
|
+
image_family="ubuntu24.04-cuda12"
|
|
162
|
+
if gpus and gpus[0].name == "B200"
|
|
163
|
+
else "ubuntu22.04-cuda12",
|
|
159
164
|
)
|
|
160
165
|
create_instance_op = None
|
|
161
166
|
try:
|
|
@@ -180,6 +185,7 @@ class NebiusCompute(
|
|
|
180
185
|
cluster_id=cluster_id,
|
|
181
186
|
disk_id=create_disk_op.resource_id,
|
|
182
187
|
subnet_id=self._get_subnet_id(instance_offer.region),
|
|
188
|
+
preemptible=instance_offer.instance.resources.spot,
|
|
183
189
|
)
|
|
184
190
|
_wait_for_instance(self._sdk, create_instance_op)
|
|
185
191
|
except BaseException:
|
|
@@ -367,4 +373,4 @@ def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
|
|
|
367
373
|
|
|
368
374
|
def _supported_instances(offer: InstanceOffer) -> bool:
|
|
369
375
|
platform, _ = offer.instance.name.split()
|
|
370
|
-
return platform in SUPPORTED_PLATFORMS
|
|
376
|
+
return platform in SUPPORTED_PLATFORMS
|
|
@@ -21,6 +21,7 @@ INFINIBAND_FABRICS = [
|
|
|
21
21
|
InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
|
|
22
22
|
InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
|
|
23
23
|
InfinibandFabric("us-central1-a", "gpu-h200-sxm", "us-central1"),
|
|
24
|
+
InfinibandFabric("us-central1-b", "gpu-b200-sxm", "us-central1"),
|
|
24
25
|
]
|
|
25
26
|
|
|
26
27
|
|
|
@@ -28,10 +28,12 @@ from nebius.api.nebius.compute.v1 import (
|
|
|
28
28
|
GpuClusterSpec,
|
|
29
29
|
Instance,
|
|
30
30
|
InstanceGpuClusterSpec,
|
|
31
|
+
InstanceRecoveryPolicy,
|
|
31
32
|
InstanceServiceClient,
|
|
32
33
|
InstanceSpec,
|
|
33
34
|
IPAddress,
|
|
34
35
|
NetworkInterfaceSpec,
|
|
36
|
+
PreemptibleSpec,
|
|
35
37
|
PublicIPAddress,
|
|
36
38
|
ResourcesSpec,
|
|
37
39
|
SourceImageFamily,
|
|
@@ -283,6 +285,7 @@ def create_instance(
|
|
|
283
285
|
cluster_id: Optional[str],
|
|
284
286
|
disk_id: str,
|
|
285
287
|
subnet_id: str,
|
|
288
|
+
preemptible: bool,
|
|
286
289
|
) -> SDKOperation[Operation]:
|
|
287
290
|
client = InstanceServiceClient(sdk)
|
|
288
291
|
request = CreateInstanceRequest(
|
|
@@ -306,6 +309,12 @@ def create_instance(
|
|
|
306
309
|
public_ip_address=PublicIPAddress(static=True),
|
|
307
310
|
)
|
|
308
311
|
],
|
|
312
|
+
preemptible=PreemptibleSpec(
|
|
313
|
+
priority=1, on_preemption=PreemptibleSpec.PreemptionPolicy.STOP
|
|
314
|
+
)
|
|
315
|
+
if preemptible
|
|
316
|
+
else None,
|
|
317
|
+
recovery_policy=InstanceRecoveryPolicy.FAIL if preemptible else None,
|
|
309
318
|
),
|
|
310
319
|
)
|
|
311
320
|
with wrap_capacity_errors():
|
|
@@ -53,6 +53,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
|
|
|
53
53
|
job_submissions_excludes["exit_status"] = True
|
|
54
54
|
if all(js.deployment_num == 0 for js in job_submissions):
|
|
55
55
|
job_submissions_excludes["deployment_num"] = True
|
|
56
|
+
if all(not js.probes for js in job_submissions):
|
|
57
|
+
job_submissions_excludes["probes"] = True
|
|
56
58
|
latest_job_submission = current_resource.latest_job_submission
|
|
57
59
|
if latest_job_submission is not None:
|
|
58
60
|
latest_job_submission_excludes: IncludeExcludeDictType = {}
|
|
@@ -69,6 +71,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
|
|
|
69
71
|
latest_job_submission_excludes["exit_status"] = True
|
|
70
72
|
if latest_job_submission.deployment_num == 0:
|
|
71
73
|
latest_job_submission_excludes["deployment_num"] = True
|
|
74
|
+
if not latest_job_submission.probes:
|
|
75
|
+
latest_job_submission_excludes["probes"] = True
|
|
72
76
|
return {"plan": apply_plan_excludes}
|
|
73
77
|
|
|
74
78
|
|
|
@@ -120,6 +124,8 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
|
|
|
120
124
|
profile_excludes.add("startup_order")
|
|
121
125
|
if configuration.stop_criteria is None:
|
|
122
126
|
configuration_excludes["stop_criteria"] = True
|
|
127
|
+
if isinstance(configuration, ServiceConfiguration) and not configuration.probes:
|
|
128
|
+
configuration_excludes["probes"] = True
|
|
123
129
|
if profile is not None and profile.stop_criteria is None:
|
|
124
130
|
profile_excludes.add("stop_criteria")
|
|
125
131
|
if not configuration.files:
|
|
@@ -154,6 +160,8 @@ def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType:
|
|
|
154
160
|
spec_excludes["file_archives"] = True
|
|
155
161
|
if all(s.service_port is None for s in job_specs):
|
|
156
162
|
spec_excludes["service_port"] = True
|
|
163
|
+
if all(not s.probes for s in job_specs):
|
|
164
|
+
spec_excludes["probes"] = True
|
|
157
165
|
|
|
158
166
|
return spec_excludes
|
|
159
167
|
|
|
@@ -11,6 +11,7 @@ class BackendType(str, enum.Enum):
|
|
|
11
11
|
DSTACK (BackendType): dstack Sky
|
|
12
12
|
GCP (BackendType): Google Cloud Platform
|
|
13
13
|
DATACRUNCH (BackendType): DataCrunch
|
|
14
|
+
HOTAISLE (BackendType): Hot Aisle
|
|
14
15
|
KUBERNETES (BackendType): Kubernetes
|
|
15
16
|
LAMBDA (BackendType): Lambda Cloud
|
|
16
17
|
NEBIUS (BackendType): Nebius AI Cloud
|
|
@@ -28,6 +29,7 @@ class BackendType(str, enum.Enum):
|
|
|
28
29
|
DATACRUNCH = "datacrunch"
|
|
29
30
|
DSTACK = "dstack"
|
|
30
31
|
GCP = "gcp"
|
|
32
|
+
HOTAISLE = "hotaisle"
|
|
31
33
|
KUBERNETES = "kubernetes"
|
|
32
34
|
LAMBDA = "lambda"
|
|
33
35
|
LOCAL = "local"
|
|
@@ -14,11 +14,12 @@ from dstack._internal.core.models.envs import Env
|
|
|
14
14
|
from dstack._internal.core.models.files import FilePathMapping
|
|
15
15
|
from dstack._internal.core.models.fleets import FleetConfiguration
|
|
16
16
|
from dstack._internal.core.models.gateways import GatewayConfiguration
|
|
17
|
-
from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
|
|
17
|
+
from dstack._internal.core.models.profiles import ProfileParams, parse_duration, parse_off_duration
|
|
18
18
|
from dstack._internal.core.models.resources import Range, ResourcesSpec
|
|
19
19
|
from dstack._internal.core.models.services import AnyModel, OpenAIChatModel
|
|
20
20
|
from dstack._internal.core.models.unix import UnixUser
|
|
21
21
|
from dstack._internal.core.models.volumes import MountPoint, VolumeConfiguration, parse_mount_point
|
|
22
|
+
from dstack._internal.utils.common import has_duplicates
|
|
22
23
|
from dstack._internal.utils.json_utils import (
|
|
23
24
|
pydantic_orjson_dumps_with_indent,
|
|
24
25
|
)
|
|
@@ -32,6 +33,14 @@ RUN_PRIOTIRY_MIN = 0
|
|
|
32
33
|
RUN_PRIOTIRY_MAX = 100
|
|
33
34
|
RUN_PRIORITY_DEFAULT = 0
|
|
34
35
|
DEFAULT_REPO_DIR = "/workflow"
|
|
36
|
+
MIN_PROBE_TIMEOUT = 1
|
|
37
|
+
MIN_PROBE_INTERVAL = 1
|
|
38
|
+
DEFAULT_PROBE_URL = "/"
|
|
39
|
+
DEFAULT_PROBE_TIMEOUT = 10
|
|
40
|
+
DEFAULT_PROBE_INTERVAL = 15
|
|
41
|
+
DEFAULT_PROBE_READY_AFTER = 1
|
|
42
|
+
DEFAULT_PROBE_METHOD = "get"
|
|
43
|
+
MAX_PROBE_URL_LEN = 2048
|
|
35
44
|
|
|
36
45
|
|
|
37
46
|
class RunConfigurationType(str, Enum):
|
|
@@ -162,6 +171,121 @@ class RateLimit(CoreModel):
|
|
|
162
171
|
] = 0
|
|
163
172
|
|
|
164
173
|
|
|
174
|
+
HTTPMethod = Literal["get", "post", "put", "delete", "patch", "head"]
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class HTTPHeaderSpec(CoreModel):
|
|
178
|
+
name: Annotated[
|
|
179
|
+
str,
|
|
180
|
+
Field(
|
|
181
|
+
description="The name of the HTTP header",
|
|
182
|
+
min_length=1,
|
|
183
|
+
max_length=256,
|
|
184
|
+
),
|
|
185
|
+
]
|
|
186
|
+
value: Annotated[
|
|
187
|
+
str,
|
|
188
|
+
Field(
|
|
189
|
+
description="The value of the HTTP header",
|
|
190
|
+
min_length=1,
|
|
191
|
+
max_length=2048,
|
|
192
|
+
),
|
|
193
|
+
]
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class ProbeConfig(CoreModel):
|
|
197
|
+
type: Literal["http"] # expect other probe types in the future, namely `exec`
|
|
198
|
+
url: Annotated[
|
|
199
|
+
Optional[str], Field(description=f"The URL to request. Defaults to `{DEFAULT_PROBE_URL}`")
|
|
200
|
+
] = None
|
|
201
|
+
method: Annotated[
|
|
202
|
+
Optional[HTTPMethod],
|
|
203
|
+
Field(
|
|
204
|
+
description=(
|
|
205
|
+
"The HTTP method to use for the probe (e.g., `get`, `post`, etc.)."
|
|
206
|
+
f" Defaults to `{DEFAULT_PROBE_METHOD}`"
|
|
207
|
+
)
|
|
208
|
+
),
|
|
209
|
+
] = None
|
|
210
|
+
headers: Annotated[
|
|
211
|
+
list[HTTPHeaderSpec],
|
|
212
|
+
Field(description="A list of HTTP headers to include in the request", max_items=16),
|
|
213
|
+
] = []
|
|
214
|
+
body: Annotated[
|
|
215
|
+
Optional[str],
|
|
216
|
+
Field(
|
|
217
|
+
description="The HTTP request body to send with the probe",
|
|
218
|
+
min_length=1,
|
|
219
|
+
max_length=2048,
|
|
220
|
+
),
|
|
221
|
+
] = None
|
|
222
|
+
timeout: Annotated[
|
|
223
|
+
Optional[Union[int, str]],
|
|
224
|
+
Field(
|
|
225
|
+
description=(
|
|
226
|
+
f"Maximum amount of time the HTTP request is allowed to take. Defaults to `{DEFAULT_PROBE_TIMEOUT}s`"
|
|
227
|
+
)
|
|
228
|
+
),
|
|
229
|
+
] = None
|
|
230
|
+
interval: Annotated[
|
|
231
|
+
Optional[Union[int, str]],
|
|
232
|
+
Field(
|
|
233
|
+
description=(
|
|
234
|
+
"Minimum amount of time between the end of one probe execution"
|
|
235
|
+
f" and the start of the next. Defaults to `{DEFAULT_PROBE_INTERVAL}s`"
|
|
236
|
+
)
|
|
237
|
+
),
|
|
238
|
+
] = None
|
|
239
|
+
ready_after: Annotated[
|
|
240
|
+
Optional[int],
|
|
241
|
+
Field(
|
|
242
|
+
ge=1,
|
|
243
|
+
description=(
|
|
244
|
+
"The number of consecutive successful probe executions required for the replica"
|
|
245
|
+
" to be considered ready. Used during rolling deployments."
|
|
246
|
+
f" Defaults to `{DEFAULT_PROBE_READY_AFTER}`"
|
|
247
|
+
),
|
|
248
|
+
),
|
|
249
|
+
] = None
|
|
250
|
+
|
|
251
|
+
@validator("timeout")
|
|
252
|
+
def parse_timeout(cls, v: Optional[Union[int, str]]) -> Optional[int]:
|
|
253
|
+
if v is None:
|
|
254
|
+
return v
|
|
255
|
+
parsed = parse_duration(v)
|
|
256
|
+
if parsed < MIN_PROBE_TIMEOUT:
|
|
257
|
+
raise ValueError(f"Probe timeout cannot be shorter than {MIN_PROBE_TIMEOUT}s")
|
|
258
|
+
return parsed
|
|
259
|
+
|
|
260
|
+
@validator("interval")
|
|
261
|
+
def parse_interval(cls, v: Optional[Union[int, str]]) -> Optional[int]:
|
|
262
|
+
if v is None:
|
|
263
|
+
return v
|
|
264
|
+
parsed = parse_duration(v)
|
|
265
|
+
if parsed < MIN_PROBE_INTERVAL:
|
|
266
|
+
raise ValueError(f"Probe interval cannot be shorter than {MIN_PROBE_INTERVAL}s")
|
|
267
|
+
return parsed
|
|
268
|
+
|
|
269
|
+
@validator("url")
|
|
270
|
+
def validate_url(cls, v: Optional[str]) -> Optional[str]:
|
|
271
|
+
if v is None:
|
|
272
|
+
return v
|
|
273
|
+
if not v.startswith("/"):
|
|
274
|
+
raise ValueError("Must start with `/`")
|
|
275
|
+
if len(v) > MAX_PROBE_URL_LEN:
|
|
276
|
+
raise ValueError(f"Cannot be longer than {MAX_PROBE_URL_LEN} characters")
|
|
277
|
+
if not v.isprintable():
|
|
278
|
+
raise ValueError("Cannot contain non-printable characters")
|
|
279
|
+
return v
|
|
280
|
+
|
|
281
|
+
@root_validator
|
|
282
|
+
def validate_body_matches_method(cls, values):
|
|
283
|
+
method: HTTPMethod = values["method"]
|
|
284
|
+
if values["body"] is not None and method in ["get", "head"]:
|
|
285
|
+
raise ValueError(f"Cannot set request body for the `{method}` method")
|
|
286
|
+
return values
|
|
287
|
+
|
|
288
|
+
|
|
165
289
|
class BaseRunConfiguration(CoreModel):
|
|
166
290
|
type: Literal["none"]
|
|
167
291
|
name: Annotated[
|
|
@@ -448,6 +572,10 @@ class ServiceConfigurationParams(CoreModel):
|
|
|
448
572
|
Field(description="The auto-scaling rules. Required if `replicas` is set to a range"),
|
|
449
573
|
] = None
|
|
450
574
|
rate_limits: Annotated[list[RateLimit], Field(description="Rate limiting rules")] = []
|
|
575
|
+
probes: Annotated[
|
|
576
|
+
list[ProbeConfig],
|
|
577
|
+
Field(description="List of probes used to determine job health"),
|
|
578
|
+
] = []
|
|
451
579
|
|
|
452
580
|
@validator("port")
|
|
453
581
|
def convert_port(cls, v) -> PortMapping:
|
|
@@ -511,6 +639,16 @@ class ServiceConfigurationParams(CoreModel):
|
|
|
511
639
|
)
|
|
512
640
|
return v
|
|
513
641
|
|
|
642
|
+
@validator("probes")
|
|
643
|
+
def validate_probes(cls, v: list[ProbeConfig]) -> list[ProbeConfig]:
|
|
644
|
+
if has_duplicates(v):
|
|
645
|
+
# Using a custom validator instead of Field(unique_items=True) to avoid Pydantic bug:
|
|
646
|
+
# https://github.com/pydantic/pydantic/issues/3765
|
|
647
|
+
# Because of the bug, our gen_schema_reference.py fails to determine the type of
|
|
648
|
+
# ServiceConfiguration.probes and insert the correct hyperlink.
|
|
649
|
+
raise ValueError("Probes must be unique")
|
|
650
|
+
return v
|
|
651
|
+
|
|
514
652
|
|
|
515
653
|
class ServiceConfiguration(
|
|
516
654
|
ProfileParams, BaseRunConfigurationWithCommands, ServiceConfigurationParams
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from enum import Enum
|
|
3
|
+
|
|
4
|
+
from dstack._internal.core.models.common import CoreModel
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class HealthStatus(str, Enum):
|
|
8
|
+
HEALTHY = "healthy"
|
|
9
|
+
WARNING = "warning"
|
|
10
|
+
FAILURE = "failure"
|
|
11
|
+
|
|
12
|
+
def is_healthy(self) -> bool:
|
|
13
|
+
return self == self.HEALTHY
|
|
14
|
+
|
|
15
|
+
def is_failure(self) -> bool:
|
|
16
|
+
return self == self.FAILURE
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class HealthEvent(CoreModel):
|
|
20
|
+
timestamp: datetime
|
|
21
|
+
status: HealthStatus
|
|
22
|
+
message: str
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class HealthCheck(CoreModel):
|
|
26
|
+
collected_at: datetime
|
|
27
|
+
status: HealthStatus
|
|
28
|
+
events: list[HealthEvent]
|
|
@@ -9,6 +9,7 @@ from pydantic import root_validator
|
|
|
9
9
|
from dstack._internal.core.models.backends.base import BackendType
|
|
10
10
|
from dstack._internal.core.models.common import CoreModel
|
|
11
11
|
from dstack._internal.core.models.envs import Env
|
|
12
|
+
from dstack._internal.core.models.health import HealthStatus
|
|
12
13
|
from dstack._internal.core.models.volumes import Volume
|
|
13
14
|
from dstack._internal.utils.common import pretty_resources
|
|
14
15
|
|
|
@@ -225,6 +226,7 @@ class Instance(CoreModel):
|
|
|
225
226
|
hostname: Optional[str] = None
|
|
226
227
|
status: InstanceStatus
|
|
227
228
|
unreachable: bool = False
|
|
229
|
+
health_status: HealthStatus = HealthStatus.HEALTHY
|
|
228
230
|
termination_reason: Optional[str] = None
|
|
229
231
|
created: datetime.datetime
|
|
230
232
|
region: Optional[str] = None
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from datetime import datetime, timedelta
|
|
2
2
|
from enum import Enum
|
|
3
|
-
from typing import Any, Dict, List, Optional, Type
|
|
3
|
+
from typing import Any, Dict, List, Literal, Optional, Type
|
|
4
4
|
|
|
5
5
|
from pydantic import UUID4, Field, root_validator
|
|
6
6
|
from typing_extensions import Annotated
|
|
@@ -8,8 +8,11 @@ from typing_extensions import Annotated
|
|
|
8
8
|
from dstack._internal.core.models.backends.base import BackendType
|
|
9
9
|
from dstack._internal.core.models.common import ApplyAction, CoreModel, NetworkMode, RegistryAuth
|
|
10
10
|
from dstack._internal.core.models.configurations import (
|
|
11
|
+
DEFAULT_PROBE_METHOD,
|
|
11
12
|
DEFAULT_REPO_DIR,
|
|
12
13
|
AnyRunConfiguration,
|
|
14
|
+
HTTPHeaderSpec,
|
|
15
|
+
HTTPMethod,
|
|
13
16
|
RunConfiguration,
|
|
14
17
|
ServiceConfiguration,
|
|
15
18
|
)
|
|
@@ -223,6 +226,17 @@ class JobSSHKey(CoreModel):
|
|
|
223
226
|
public: str
|
|
224
227
|
|
|
225
228
|
|
|
229
|
+
class ProbeSpec(CoreModel):
|
|
230
|
+
type: Literal["http"] # expect other probe types in the future, namely `exec`
|
|
231
|
+
url: str
|
|
232
|
+
method: HTTPMethod = DEFAULT_PROBE_METHOD
|
|
233
|
+
headers: list[HTTPHeaderSpec] = []
|
|
234
|
+
body: Optional[str] = None
|
|
235
|
+
timeout: int
|
|
236
|
+
interval: int
|
|
237
|
+
ready_after: int
|
|
238
|
+
|
|
239
|
+
|
|
226
240
|
class JobSpec(CoreModel):
|
|
227
241
|
replica_num: int = 0 # default value for backward compatibility
|
|
228
242
|
job_num: int
|
|
@@ -256,6 +270,7 @@ class JobSpec(CoreModel):
|
|
|
256
270
|
file_archives: list[FileArchiveMapping] = []
|
|
257
271
|
# None for non-services and pre-0.19.19 services. See `get_service_port`
|
|
258
272
|
service_port: Optional[int] = None
|
|
273
|
+
probes: list[ProbeSpec] = []
|
|
259
274
|
|
|
260
275
|
|
|
261
276
|
class JobProvisioningData(CoreModel):
|
|
@@ -325,6 +340,10 @@ class ClusterInfo(CoreModel):
|
|
|
325
340
|
gpus_per_job: int
|
|
326
341
|
|
|
327
342
|
|
|
343
|
+
class Probe(CoreModel):
|
|
344
|
+
success_streak: int
|
|
345
|
+
|
|
346
|
+
|
|
328
347
|
class JobSubmission(CoreModel):
|
|
329
348
|
id: UUID4
|
|
330
349
|
submission_num: int
|
|
@@ -341,6 +360,7 @@ class JobSubmission(CoreModel):
|
|
|
341
360
|
job_provisioning_data: Optional[JobProvisioningData]
|
|
342
361
|
job_runtime_data: Optional[JobRuntimeData]
|
|
343
362
|
error: Optional[str] = None
|
|
363
|
+
probes: list[Probe] = []
|
|
344
364
|
|
|
345
365
|
@property
|
|
346
366
|
def age(self) -> timedelta:
|
|
@@ -236,6 +236,13 @@ class SSHTunnel:
|
|
|
236
236
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
237
237
|
self.close()
|
|
238
238
|
|
|
239
|
+
async def __aenter__(self):
|
|
240
|
+
await self.aopen()
|
|
241
|
+
return self
|
|
242
|
+
|
|
243
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
244
|
+
await self.aclose()
|
|
245
|
+
|
|
239
246
|
def _get_proxy_command(self) -> Optional[str]:
|
|
240
247
|
proxy_command: Optional[str] = None
|
|
241
248
|
for params, identity_path in self.ssh_proxies:
|
dstack/_internal/server/app.py
CHANGED
|
@@ -22,6 +22,7 @@ from dstack._internal.proxy.lib.deps import get_injector_from_app
|
|
|
22
22
|
from dstack._internal.proxy.lib.routers import model_proxy
|
|
23
23
|
from dstack._internal.server import settings
|
|
24
24
|
from dstack._internal.server.background import start_background_tasks
|
|
25
|
+
from dstack._internal.server.background.tasks.process_probes import PROBES_SCHEDULER
|
|
25
26
|
from dstack._internal.server.db import get_db, get_session_ctx, migrate
|
|
26
27
|
from dstack._internal.server.routers import (
|
|
27
28
|
backends,
|
|
@@ -155,6 +156,7 @@ async def lifespan(app: FastAPI):
|
|
|
155
156
|
scheduler = start_background_tasks()
|
|
156
157
|
else:
|
|
157
158
|
logger.info("Background processing is disabled")
|
|
159
|
+
PROBES_SCHEDULER.start()
|
|
158
160
|
dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
|
|
159
161
|
logger.info(f"The admin token is {admin.token.get_plaintext_or_error()}", {"show_path": False})
|
|
160
162
|
logger.info(
|
|
@@ -166,6 +168,7 @@ async def lifespan(app: FastAPI):
|
|
|
166
168
|
yield
|
|
167
169
|
if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
|
|
168
170
|
scheduler.shutdown()
|
|
171
|
+
PROBES_SCHEDULER.shutdown(wait=False)
|
|
169
172
|
await gateway_connections_pool.remove_all()
|
|
170
173
|
service_conn_pool = await get_injector_from_app(app).get_service_connection_pool()
|
|
171
174
|
await service_conn_pool.remove_all()
|
|
@@ -197,6 +200,7 @@ def register_routes(app: FastAPI, ui: bool = True):
|
|
|
197
200
|
app.include_router(fleets.root_router)
|
|
198
201
|
app.include_router(fleets.project_router)
|
|
199
202
|
app.include_router(instances.root_router)
|
|
203
|
+
app.include_router(instances.project_router)
|
|
200
204
|
app.include_router(repos.router)
|
|
201
205
|
app.include_router(runs.root_router)
|
|
202
206
|
app.include_router(runs.project_router)
|
|
@@ -9,6 +9,7 @@ from dstack._internal.server.background.tasks.process_gateways import (
|
|
|
9
9
|
)
|
|
10
10
|
from dstack._internal.server.background.tasks.process_idle_volumes import process_idle_volumes
|
|
11
11
|
from dstack._internal.server.background.tasks.process_instances import (
|
|
12
|
+
delete_instance_health_checks,
|
|
12
13
|
process_instances,
|
|
13
14
|
)
|
|
14
15
|
from dstack._internal.server.background.tasks.process_metrics import (
|
|
@@ -18,6 +19,7 @@ from dstack._internal.server.background.tasks.process_metrics import (
|
|
|
18
19
|
from dstack._internal.server.background.tasks.process_placement_groups import (
|
|
19
20
|
process_placement_groups,
|
|
20
21
|
)
|
|
22
|
+
from dstack._internal.server.background.tasks.process_probes import process_probes
|
|
21
23
|
from dstack._internal.server.background.tasks.process_prometheus_metrics import (
|
|
22
24
|
collect_prometheus_metrics,
|
|
23
25
|
delete_prometheus_metrics,
|
|
@@ -63,6 +65,7 @@ def start_background_tasks() -> AsyncIOScheduler:
|
|
|
63
65
|
# that the first waiting for the lock will acquire it.
|
|
64
66
|
# The jitter is needed to give all tasks a chance to acquire locks.
|
|
65
67
|
|
|
68
|
+
_scheduler.add_job(process_probes, IntervalTrigger(seconds=3, jitter=1))
|
|
66
69
|
_scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
|
|
67
70
|
_scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
|
|
68
71
|
if settings.ENABLE_PROMETHEUS_METRICS:
|
|
@@ -84,6 +87,7 @@ def start_background_tasks() -> AsyncIOScheduler:
|
|
|
84
87
|
IntervalTrigger(seconds=10, jitter=2),
|
|
85
88
|
max_instances=1,
|
|
86
89
|
)
|
|
90
|
+
_scheduler.add_job(delete_instance_health_checks, IntervalTrigger(minutes=5), max_instances=1)
|
|
87
91
|
for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):
|
|
88
92
|
# Add multiple copies of tasks if requested.
|
|
89
93
|
# max_instances=1 for additional copies to avoid running too many tasks.
|