dstack 0.19.20__py3-none-any.whl → 0.19.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/apply.py +8 -3
- dstack/_internal/cli/services/configurators/__init__.py +8 -0
- dstack/_internal/cli/services/configurators/fleet.py +1 -1
- dstack/_internal/cli/services/configurators/gateway.py +1 -1
- dstack/_internal/cli/services/configurators/run.py +11 -1
- dstack/_internal/cli/services/configurators/volume.py +1 -1
- dstack/_internal/cli/utils/common.py +48 -5
- dstack/_internal/cli/utils/fleet.py +5 -5
- dstack/_internal/cli/utils/run.py +32 -0
- dstack/_internal/core/backends/__init__.py +0 -65
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/features.py +64 -0
- dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
- dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
- dstack/_internal/core/backends/hotaisle/backend.py +16 -0
- dstack/_internal/core/backends/hotaisle/compute.py +225 -0
- dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
- dstack/_internal/core/backends/hotaisle/models.py +45 -0
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
- dstack/_internal/core/backends/models.py +8 -0
- dstack/_internal/core/compatibility/fleets.py +2 -0
- dstack/_internal/core/compatibility/runs.py +12 -0
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/core/models/configurations.py +139 -1
- dstack/_internal/core/models/health.py +28 -0
- dstack/_internal/core/models/instances.py +2 -0
- dstack/_internal/core/models/logs.py +2 -1
- dstack/_internal/core/models/profiles.py +37 -0
- dstack/_internal/core/models/runs.py +21 -1
- dstack/_internal/core/services/ssh/tunnel.py +7 -0
- dstack/_internal/server/app.py +26 -10
- dstack/_internal/server/background/__init__.py +9 -6
- dstack/_internal/server/background/tasks/process_fleets.py +52 -38
- dstack/_internal/server/background/tasks/process_gateways.py +2 -2
- dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
- dstack/_internal/server/background/tasks/process_instances.py +168 -103
- dstack/_internal/server/background/tasks/process_metrics.py +9 -2
- dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
- dstack/_internal/server/background/tasks/process_probes.py +164 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +142 -124
- dstack/_internal/server/background/tasks/process_runs.py +84 -34
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
- dstack/_internal/server/background/tasks/process_volumes.py +4 -1
- dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
- dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
- dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
- dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
- dstack/_internal/server/models.py +57 -16
- dstack/_internal/server/routers/instances.py +33 -5
- dstack/_internal/server/schemas/health/dcgm.py +56 -0
- dstack/_internal/server/schemas/instances.py +32 -0
- dstack/_internal/server/schemas/runner.py +5 -0
- dstack/_internal/server/services/fleets.py +19 -10
- dstack/_internal/server/services/gateways/__init__.py +17 -17
- dstack/_internal/server/services/instances.py +113 -15
- dstack/_internal/server/services/jobs/__init__.py +18 -13
- dstack/_internal/server/services/jobs/configurators/base.py +26 -0
- dstack/_internal/server/services/logging.py +4 -2
- dstack/_internal/server/services/logs/aws.py +13 -1
- dstack/_internal/server/services/logs/gcp.py +16 -1
- dstack/_internal/server/services/offers.py +3 -3
- dstack/_internal/server/services/probes.py +6 -0
- dstack/_internal/server/services/projects.py +51 -19
- dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
- dstack/_internal/server/services/prometheus/custom_metrics.py +2 -3
- dstack/_internal/server/services/runner/client.py +52 -20
- dstack/_internal/server/services/runner/ssh.py +4 -4
- dstack/_internal/server/services/runs.py +115 -39
- dstack/_internal/server/services/services/__init__.py +4 -1
- dstack/_internal/server/services/ssh.py +66 -0
- dstack/_internal/server/services/users.py +2 -3
- dstack/_internal/server/services/volumes.py +11 -11
- dstack/_internal/server/settings.py +16 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
- dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
- dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
- dstack/_internal/server/testing/common.py +51 -0
- dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
- dstack/_internal/server/utils/sentry_utils.py +12 -0
- dstack/_internal/settings.py +3 -0
- dstack/_internal/utils/common.py +15 -0
- dstack/_internal/utils/cron.py +5 -0
- dstack/api/server/__init__.py +1 -1
- dstack/version.py +1 -1
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/METADATA +13 -22
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/RECORD +93 -75
- /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/WHEEL +0 -0
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from enum import IntEnum
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.models.common import CoreModel
|
|
4
|
+
from dstack._internal.core.models.health import HealthStatus
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DCGMHealthResult(IntEnum):
|
|
8
|
+
"""
|
|
9
|
+
`dcgmHealthWatchResult_enum`
|
|
10
|
+
|
|
11
|
+
See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/const.go#L1020-L1026
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
DCGM_HEALTH_RESULT_PASS = 0
|
|
15
|
+
DCGM_HEALTH_RESULT_WARN = 10
|
|
16
|
+
DCGM_HEALTH_RESULT_FAIL = 20
|
|
17
|
+
|
|
18
|
+
def to_health_status(self) -> HealthStatus:
|
|
19
|
+
if self == self.DCGM_HEALTH_RESULT_PASS:
|
|
20
|
+
return HealthStatus.HEALTHY
|
|
21
|
+
if self == self.DCGM_HEALTH_RESULT_WARN:
|
|
22
|
+
return HealthStatus.WARNING
|
|
23
|
+
if self == self.DCGM_HEALTH_RESULT_FAIL:
|
|
24
|
+
return HealthStatus.FAILURE
|
|
25
|
+
raise AssertionError("should not reach here")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DCGMHealthIncident(CoreModel):
|
|
29
|
+
"""
|
|
30
|
+
Flattened `dcgmIncidentInfo_t`
|
|
31
|
+
|
|
32
|
+
See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/health.go#L68-L73
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
# dcgmIncidentInfo_t
|
|
36
|
+
system: int
|
|
37
|
+
health: DCGMHealthResult
|
|
38
|
+
|
|
39
|
+
# dcgmDiagErrorDetail_t
|
|
40
|
+
error_message: str
|
|
41
|
+
error_code: int
|
|
42
|
+
|
|
43
|
+
# dcgmGroupEntityPair_t
|
|
44
|
+
entity_group_id: int # dcgmGroupEntityPair_t
|
|
45
|
+
entity_id: int
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class DCGMHealthResponse(CoreModel):
|
|
49
|
+
"""
|
|
50
|
+
`dcgmHealthResponse_v5`
|
|
51
|
+
|
|
52
|
+
See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/health.go#L75-L78
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
overall_health: DCGMHealthResult
|
|
56
|
+
incidents: list[DCGMHealthIncident]
|
|
@@ -3,6 +3,8 @@ from typing import Optional
|
|
|
3
3
|
from uuid import UUID
|
|
4
4
|
|
|
5
5
|
from dstack._internal.core.models.common import CoreModel
|
|
6
|
+
from dstack._internal.core.models.health import HealthCheck, HealthStatus
|
|
7
|
+
from dstack._internal.server.schemas.runner import InstanceHealthResponse
|
|
6
8
|
|
|
7
9
|
|
|
8
10
|
class ListInstancesRequest(CoreModel):
|
|
@@ -13,3 +15,33 @@ class ListInstancesRequest(CoreModel):
|
|
|
13
15
|
prev_id: Optional[UUID] = None
|
|
14
16
|
limit: int = 1000
|
|
15
17
|
ascending: bool = False
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class InstanceCheck(CoreModel):
|
|
21
|
+
reachable: bool
|
|
22
|
+
message: Optional[str] = None
|
|
23
|
+
health_response: Optional[InstanceHealthResponse] = None
|
|
24
|
+
|
|
25
|
+
def get_health_status(self) -> HealthStatus:
|
|
26
|
+
if self.health_response is None:
|
|
27
|
+
return HealthStatus.HEALTHY
|
|
28
|
+
if self.health_response.dcgm is None:
|
|
29
|
+
return HealthStatus.HEALTHY
|
|
30
|
+
return self.health_response.dcgm.overall_health.to_health_status()
|
|
31
|
+
|
|
32
|
+
def has_health_checks(self) -> bool:
|
|
33
|
+
if self.health_response is None:
|
|
34
|
+
return False
|
|
35
|
+
return self.health_response.dcgm is not None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class GetInstanceHealthChecksRequest(CoreModel):
|
|
39
|
+
fleet_name: str
|
|
40
|
+
instance_num: int
|
|
41
|
+
after: Optional[datetime] = None
|
|
42
|
+
before: Optional[datetime] = None
|
|
43
|
+
limit: Optional[int] = None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class GetInstanceHealthChecksResponse(CoreModel):
|
|
47
|
+
health_checks: list[HealthCheck]
|
|
@@ -16,6 +16,7 @@ from dstack._internal.core.models.runs import (
|
|
|
16
16
|
RunSpec,
|
|
17
17
|
)
|
|
18
18
|
from dstack._internal.core.models.volumes import InstanceMountPoint, VolumeMountPoint
|
|
19
|
+
from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class JobStateEvent(CoreModel):
|
|
@@ -114,6 +115,10 @@ class HealthcheckResponse(CoreModel):
|
|
|
114
115
|
version: str
|
|
115
116
|
|
|
116
117
|
|
|
118
|
+
class InstanceHealthResponse(CoreModel):
|
|
119
|
+
dcgm: Optional[DCGMHealthResponse] = None
|
|
120
|
+
|
|
121
|
+
|
|
117
122
|
class GPUMetrics(CoreModel):
|
|
118
123
|
gpu_memory_usage_bytes: int
|
|
119
124
|
gpu_util_percent: int
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import uuid
|
|
2
2
|
from collections.abc import Callable
|
|
3
|
-
from datetime import datetime
|
|
3
|
+
from datetime import datetime
|
|
4
4
|
from functools import wraps
|
|
5
5
|
from typing import List, Literal, Optional, Tuple, TypeVar, Union, cast
|
|
6
6
|
|
|
@@ -8,8 +8,8 @@ from sqlalchemy import and_, func, or_, select
|
|
|
8
8
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
9
9
|
from sqlalchemy.orm import joinedload, selectinload
|
|
10
10
|
|
|
11
|
-
from dstack._internal.core.backends import BACKENDS_WITH_CREATE_INSTANCE_SUPPORT
|
|
12
11
|
from dstack._internal.core.backends.base.backend import Backend
|
|
12
|
+
from dstack._internal.core.backends.features import BACKENDS_WITH_CREATE_INSTANCE_SUPPORT
|
|
13
13
|
from dstack._internal.core.errors import (
|
|
14
14
|
ForbiddenError,
|
|
15
15
|
ResourceExistsError,
|
|
@@ -49,6 +49,7 @@ from dstack._internal.server.db import get_db
|
|
|
49
49
|
from dstack._internal.server.models import (
|
|
50
50
|
FleetModel,
|
|
51
51
|
InstanceModel,
|
|
52
|
+
JobModel,
|
|
52
53
|
ProjectModel,
|
|
53
54
|
UserModel,
|
|
54
55
|
)
|
|
@@ -66,7 +67,6 @@ from dstack._internal.server.services.plugins import apply_plugin_policies
|
|
|
66
67
|
from dstack._internal.server.services.projects import (
|
|
67
68
|
get_member,
|
|
68
69
|
get_member_permissions,
|
|
69
|
-
list_project_models,
|
|
70
70
|
list_user_project_models,
|
|
71
71
|
)
|
|
72
72
|
from dstack._internal.server.services.resources import set_resources_defaults
|
|
@@ -87,10 +87,11 @@ async def list_fleets(
|
|
|
87
87
|
limit: int,
|
|
88
88
|
ascending: bool,
|
|
89
89
|
) -> List[Fleet]:
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
90
|
+
projects = await list_user_project_models(
|
|
91
|
+
session=session,
|
|
92
|
+
user=user,
|
|
93
|
+
only_names=True,
|
|
94
|
+
)
|
|
94
95
|
if project_name is not None:
|
|
95
96
|
projects = [p for p in projects if p.name == project_name]
|
|
96
97
|
fleet_models = await list_projects_fleet_models(
|
|
@@ -398,7 +399,11 @@ async def apply_plan(
|
|
|
398
399
|
FleetModel.id == fleet_model.id,
|
|
399
400
|
FleetModel.deleted == False,
|
|
400
401
|
)
|
|
401
|
-
.options(
|
|
402
|
+
.options(
|
|
403
|
+
selectinload(FleetModel.instances)
|
|
404
|
+
.joinedload(InstanceModel.jobs)
|
|
405
|
+
.load_only(JobModel.id)
|
|
406
|
+
)
|
|
402
407
|
.options(selectinload(FleetModel.runs))
|
|
403
408
|
.execution_options(populate_existing=True)
|
|
404
409
|
.order_by(FleetModel.id) # take locks in order
|
|
@@ -563,7 +568,11 @@ async def delete_fleets(
|
|
|
563
568
|
FleetModel.name.in_(names),
|
|
564
569
|
FleetModel.deleted == False,
|
|
565
570
|
)
|
|
566
|
-
.options(
|
|
571
|
+
.options(
|
|
572
|
+
selectinload(FleetModel.instances)
|
|
573
|
+
.joinedload(InstanceModel.jobs)
|
|
574
|
+
.load_only(JobModel.id)
|
|
575
|
+
)
|
|
567
576
|
.options(selectinload(FleetModel.runs))
|
|
568
577
|
.execution_options(populate_existing=True)
|
|
569
578
|
.order_by(FleetModel.id) # take locks in order
|
|
@@ -600,7 +609,7 @@ def fleet_model_to_fleet(
|
|
|
600
609
|
name=fleet_model.name,
|
|
601
610
|
project_name=fleet_model.project.name,
|
|
602
611
|
spec=spec,
|
|
603
|
-
created_at=fleet_model.created_at
|
|
612
|
+
created_at=fleet_model.created_at,
|
|
604
613
|
status=fleet_model.status,
|
|
605
614
|
status_message=fleet_model.status_message,
|
|
606
615
|
instances=instances,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import datetime
|
|
3
3
|
import uuid
|
|
4
|
-
from datetime import timedelta
|
|
4
|
+
from datetime import timedelta
|
|
5
5
|
from functools import partial
|
|
6
6
|
from typing import List, Optional, Sequence
|
|
7
7
|
|
|
@@ -11,16 +11,16 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
11
11
|
from sqlalchemy.orm import selectinload
|
|
12
12
|
|
|
13
13
|
import dstack._internal.utils.random_names as random_names
|
|
14
|
-
from dstack._internal.core.backends import (
|
|
15
|
-
BACKENDS_WITH_GATEWAY_SUPPORT,
|
|
16
|
-
BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT,
|
|
17
|
-
)
|
|
18
14
|
from dstack._internal.core.backends.base.compute import (
|
|
19
15
|
Compute,
|
|
20
16
|
ComputeWithGatewaySupport,
|
|
21
17
|
get_dstack_gateway_wheel,
|
|
22
18
|
get_dstack_runner_version,
|
|
23
19
|
)
|
|
20
|
+
from dstack._internal.core.backends.features import (
|
|
21
|
+
BACKENDS_WITH_GATEWAY_SUPPORT,
|
|
22
|
+
BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT,
|
|
23
|
+
)
|
|
24
24
|
from dstack._internal.core.errors import (
|
|
25
25
|
GatewayError,
|
|
26
26
|
ResourceNotExistsError,
|
|
@@ -86,15 +86,6 @@ async def get_gateway_by_name(
|
|
|
86
86
|
return gateway_model_to_gateway(gateway)
|
|
87
87
|
|
|
88
88
|
|
|
89
|
-
async def get_project_default_gateway(
|
|
90
|
-
session: AsyncSession, project: ProjectModel
|
|
91
|
-
) -> Optional[Gateway]:
|
|
92
|
-
gateway: Optional[GatewayModel] = project.default_gateway
|
|
93
|
-
if gateway is None:
|
|
94
|
-
return None
|
|
95
|
-
return gateway_model_to_gateway(gateway)
|
|
96
|
-
|
|
97
|
-
|
|
98
89
|
async def create_gateway_compute(
|
|
99
90
|
project_name: str,
|
|
100
91
|
backend_compute: Compute,
|
|
@@ -181,9 +172,9 @@ async def create_gateway(
|
|
|
181
172
|
session.add(gateway)
|
|
182
173
|
await session.commit()
|
|
183
174
|
|
|
184
|
-
|
|
175
|
+
default_gateway = await get_project_default_gateway_model(session=session, project=project)
|
|
176
|
+
if default_gateway is None or configuration.default:
|
|
185
177
|
await set_default_gateway(session=session, project=project, name=configuration.name)
|
|
186
|
-
|
|
187
178
|
return gateway_model_to_gateway(gateway)
|
|
188
179
|
|
|
189
180
|
|
|
@@ -349,6 +340,15 @@ async def get_project_gateway_model_by_name(
|
|
|
349
340
|
return res.scalar()
|
|
350
341
|
|
|
351
342
|
|
|
343
|
+
async def get_project_default_gateway_model(
|
|
344
|
+
session: AsyncSession, project: ProjectModel
|
|
345
|
+
) -> Optional[GatewayModel]:
|
|
346
|
+
res = await session.execute(
|
|
347
|
+
select(GatewayModel).where(GatewayModel.id == project.default_gateway_id)
|
|
348
|
+
)
|
|
349
|
+
return res.scalar_one_or_none()
|
|
350
|
+
|
|
351
|
+
|
|
352
352
|
async def generate_gateway_name(session: AsyncSession, project: ProjectModel) -> str:
|
|
353
353
|
gateways = await list_project_gateway_models(session=session, project=project)
|
|
354
354
|
names = {g.name for g in gateways}
|
|
@@ -557,7 +557,7 @@ def gateway_model_to_gateway(gateway_model: GatewayModel) -> Gateway:
|
|
|
557
557
|
region=gateway_model.region,
|
|
558
558
|
wildcard_domain=gateway_model.wildcard_domain,
|
|
559
559
|
default=gateway_model.project.default_gateway_id == gateway_model.id,
|
|
560
|
-
created_at=gateway_model.created_at
|
|
560
|
+
created_at=gateway_model.created_at,
|
|
561
561
|
status=gateway_model.status,
|
|
562
562
|
status_message=gateway_model.status_message,
|
|
563
563
|
configuration=configuration,
|
|
@@ -1,20 +1,23 @@
|
|
|
1
|
+
import operator
|
|
1
2
|
import uuid
|
|
2
3
|
from collections.abc import Container, Iterable
|
|
3
|
-
from datetime import datetime
|
|
4
|
+
from datetime import datetime
|
|
4
5
|
from typing import Dict, List, Literal, Optional, Union
|
|
5
6
|
|
|
6
7
|
import gpuhunt
|
|
7
8
|
from sqlalchemy import and_, or_, select
|
|
8
9
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
9
|
-
from sqlalchemy.orm import joinedload
|
|
10
|
+
from sqlalchemy.orm import joinedload, load_only
|
|
10
11
|
|
|
11
|
-
from dstack._internal.core.backends import BACKENDS_WITH_MULTINODE_SUPPORT
|
|
12
12
|
from dstack._internal.core.backends.base.offers import (
|
|
13
13
|
offer_to_catalog_item,
|
|
14
14
|
requirements_to_query_filter,
|
|
15
15
|
)
|
|
16
|
+
from dstack._internal.core.backends.features import BACKENDS_WITH_MULTINODE_SUPPORT
|
|
17
|
+
from dstack._internal.core.errors import ResourceNotExistsError
|
|
16
18
|
from dstack._internal.core.models.backends.base import BackendType
|
|
17
19
|
from dstack._internal.core.models.envs import Env
|
|
20
|
+
from dstack._internal.core.models.health import HealthCheck, HealthEvent, HealthStatus
|
|
18
21
|
from dstack._internal.core.models.instances import (
|
|
19
22
|
Instance,
|
|
20
23
|
InstanceAvailability,
|
|
@@ -34,23 +37,76 @@ from dstack._internal.core.models.profiles import (
|
|
|
34
37
|
TerminationPolicy,
|
|
35
38
|
)
|
|
36
39
|
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
|
|
37
|
-
from dstack._internal.core.models.users import GlobalRole
|
|
38
40
|
from dstack._internal.core.models.volumes import Volume
|
|
39
41
|
from dstack._internal.core.services.profiles import get_termination
|
|
40
42
|
from dstack._internal.server.models import (
|
|
41
43
|
FleetModel,
|
|
44
|
+
InstanceHealthCheckModel,
|
|
42
45
|
InstanceModel,
|
|
43
46
|
ProjectModel,
|
|
44
47
|
UserModel,
|
|
45
48
|
)
|
|
49
|
+
from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse
|
|
50
|
+
from dstack._internal.server.schemas.runner import InstanceHealthResponse
|
|
46
51
|
from dstack._internal.server.services.offers import generate_shared_offer
|
|
47
|
-
from dstack._internal.server.services.projects import
|
|
52
|
+
from dstack._internal.server.services.projects import list_user_project_models
|
|
48
53
|
from dstack._internal.utils import common as common_utils
|
|
49
54
|
from dstack._internal.utils.logging import get_logger
|
|
50
55
|
|
|
51
56
|
logger = get_logger(__name__)
|
|
52
57
|
|
|
53
58
|
|
|
59
|
+
async def get_instance_health_checks(
|
|
60
|
+
session: AsyncSession,
|
|
61
|
+
project: ProjectModel,
|
|
62
|
+
fleet_name: str,
|
|
63
|
+
instance_num: int,
|
|
64
|
+
after: Optional[datetime] = None,
|
|
65
|
+
before: Optional[datetime] = None,
|
|
66
|
+
limit: Optional[int] = None,
|
|
67
|
+
) -> list[HealthCheck]:
|
|
68
|
+
"""
|
|
69
|
+
Returns instance health checks ordered from the latest to the earliest.
|
|
70
|
+
|
|
71
|
+
Expected usage:
|
|
72
|
+
* limit=100 — get the latest 100 checks
|
|
73
|
+
* after=<now - 1 hour> — get checks for the last hour
|
|
74
|
+
* before=<earliest timestamp from the last batch>, limit=100 — paginate back in history
|
|
75
|
+
"""
|
|
76
|
+
res = await session.execute(
|
|
77
|
+
select(InstanceModel)
|
|
78
|
+
.join(FleetModel)
|
|
79
|
+
.where(
|
|
80
|
+
~InstanceModel.deleted,
|
|
81
|
+
InstanceModel.project_id == project.id,
|
|
82
|
+
InstanceModel.instance_num == instance_num,
|
|
83
|
+
FleetModel.name == fleet_name,
|
|
84
|
+
)
|
|
85
|
+
.options(load_only(InstanceModel.id))
|
|
86
|
+
)
|
|
87
|
+
instance = res.scalar_one_or_none()
|
|
88
|
+
if instance is None:
|
|
89
|
+
raise ResourceNotExistsError()
|
|
90
|
+
|
|
91
|
+
stmt = (
|
|
92
|
+
select(InstanceHealthCheckModel)
|
|
93
|
+
.where(InstanceHealthCheckModel.instance_id == instance.id)
|
|
94
|
+
.order_by(InstanceHealthCheckModel.collected_at.desc())
|
|
95
|
+
)
|
|
96
|
+
if after is not None:
|
|
97
|
+
stmt = stmt.where(InstanceHealthCheckModel.collected_at > after)
|
|
98
|
+
if before is not None:
|
|
99
|
+
stmt = stmt.where(InstanceHealthCheckModel.collected_at < before)
|
|
100
|
+
if limit is not None:
|
|
101
|
+
stmt = stmt.limit(limit)
|
|
102
|
+
health_checks: list[HealthCheck] = []
|
|
103
|
+
res = await session.execute(stmt)
|
|
104
|
+
for health_check_model in res.scalars():
|
|
105
|
+
health_check = instance_health_check_model_to_health_check(health_check_model)
|
|
106
|
+
health_checks.append(health_check)
|
|
107
|
+
return health_checks
|
|
108
|
+
|
|
109
|
+
|
|
54
110
|
def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
|
|
55
111
|
instance = Instance(
|
|
56
112
|
id=instance_model.id,
|
|
@@ -61,8 +117,9 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
|
|
|
61
117
|
instance_num=instance_model.instance_num,
|
|
62
118
|
status=instance_model.status,
|
|
63
119
|
unreachable=instance_model.unreachable,
|
|
120
|
+
health_status=instance_model.health,
|
|
64
121
|
termination_reason=instance_model.termination_reason,
|
|
65
|
-
created=instance_model.created_at
|
|
122
|
+
created=instance_model.created_at,
|
|
66
123
|
total_blocks=instance_model.total_blocks,
|
|
67
124
|
busy_blocks=instance_model.busy_blocks,
|
|
68
125
|
)
|
|
@@ -82,6 +139,48 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
|
|
|
82
139
|
return instance
|
|
83
140
|
|
|
84
141
|
|
|
142
|
+
def instance_health_check_model_to_health_check(model: InstanceHealthCheckModel) -> HealthCheck:
|
|
143
|
+
collected_at = model.collected_at
|
|
144
|
+
status = HealthStatus.HEALTHY
|
|
145
|
+
events: list[HealthEvent] = []
|
|
146
|
+
instance_health_response = get_instance_health_response(model)
|
|
147
|
+
if (dcgm := instance_health_response.dcgm) is not None:
|
|
148
|
+
dcgm_health_check = dcgm_health_response_to_health_check(dcgm, collected_at)
|
|
149
|
+
status = dcgm_health_check.status
|
|
150
|
+
events.extend(dcgm_health_check.events)
|
|
151
|
+
events.sort(key=operator.attrgetter("timestamp"), reverse=True)
|
|
152
|
+
return HealthCheck(
|
|
153
|
+
collected_at=collected_at,
|
|
154
|
+
status=status,
|
|
155
|
+
events=events,
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def dcgm_health_response_to_health_check(
|
|
160
|
+
response: DCGMHealthResponse, collected_at: datetime
|
|
161
|
+
) -> HealthCheck:
|
|
162
|
+
events: list[HealthEvent] = []
|
|
163
|
+
for incident in response.incidents:
|
|
164
|
+
events.append(
|
|
165
|
+
HealthEvent(
|
|
166
|
+
timestamp=collected_at,
|
|
167
|
+
status=incident.health.to_health_status(),
|
|
168
|
+
message=incident.error_message,
|
|
169
|
+
)
|
|
170
|
+
)
|
|
171
|
+
return HealthCheck(
|
|
172
|
+
collected_at=collected_at,
|
|
173
|
+
status=response.overall_health.to_health_status(),
|
|
174
|
+
events=events,
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def get_instance_health_response(
|
|
179
|
+
instance_health_check_model: InstanceHealthCheckModel,
|
|
180
|
+
) -> InstanceHealthResponse:
|
|
181
|
+
return InstanceHealthResponse.__response__.parse_raw(instance_health_check_model.response)
|
|
182
|
+
|
|
183
|
+
|
|
85
184
|
def get_instance_provisioning_data(instance_model: InstanceModel) -> Optional[JobProvisioningData]:
|
|
86
185
|
if instance_model.job_provisioning_data is None:
|
|
87
186
|
return None
|
|
@@ -195,6 +294,8 @@ def filter_pool_instances(
|
|
|
195
294
|
continue
|
|
196
295
|
if instance.unreachable:
|
|
197
296
|
continue
|
|
297
|
+
if instance.health.is_failure():
|
|
298
|
+
continue
|
|
198
299
|
fleet = instance.fleet
|
|
199
300
|
if profile.fleets is not None and (fleet is None or fleet.name not in profile.fleets):
|
|
200
301
|
continue
|
|
@@ -372,18 +473,15 @@ async def list_user_instances(
|
|
|
372
473
|
limit: int,
|
|
373
474
|
ascending: bool,
|
|
374
475
|
) -> List[Instance]:
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
return []
|
|
381
|
-
|
|
476
|
+
projects = await list_user_project_models(
|
|
477
|
+
session=session,
|
|
478
|
+
user=user,
|
|
479
|
+
only_names=True,
|
|
480
|
+
)
|
|
382
481
|
if project_names is not None:
|
|
383
|
-
projects = [
|
|
482
|
+
projects = [p for p in projects if p.name in project_names]
|
|
384
483
|
if len(projects) == 0:
|
|
385
484
|
return []
|
|
386
|
-
|
|
387
485
|
instance_models = await list_projects_instance_models(
|
|
388
486
|
session=session,
|
|
389
487
|
projects=projects,
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import itertools
|
|
2
2
|
import json
|
|
3
|
-
from datetime import timedelta
|
|
3
|
+
from datetime import timedelta
|
|
4
4
|
from typing import Dict, Iterable, List, Optional, Tuple
|
|
5
5
|
from uuid import UUID
|
|
6
6
|
|
|
7
7
|
import requests
|
|
8
8
|
from sqlalchemy import select
|
|
9
9
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
10
|
-
from sqlalchemy.orm import joinedload
|
|
10
|
+
from sqlalchemy.orm import joinedload, load_only
|
|
11
11
|
|
|
12
12
|
import dstack._internal.server.services.backends as backends_services
|
|
13
13
|
from dstack._internal.core.backends.base.backend import Backend
|
|
@@ -52,6 +52,7 @@ from dstack._internal.server.services.jobs.configurators.dev import DevEnvironme
|
|
|
52
52
|
from dstack._internal.server.services.jobs.configurators.service import ServiceJobConfigurator
|
|
53
53
|
from dstack._internal.server.services.jobs.configurators.task import TaskJobConfigurator
|
|
54
54
|
from dstack._internal.server.services.logging import fmt
|
|
55
|
+
from dstack._internal.server.services.probes import probe_model_to_probe
|
|
55
56
|
from dstack._internal.server.services.runner import client
|
|
56
57
|
from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
|
|
57
58
|
from dstack._internal.server.services.volumes import (
|
|
@@ -115,7 +116,9 @@ async def get_run_job_model(
|
|
|
115
116
|
return res.scalar_one_or_none()
|
|
116
117
|
|
|
117
118
|
|
|
118
|
-
def job_model_to_job_submission(
|
|
119
|
+
def job_model_to_job_submission(
|
|
120
|
+
job_model: JobModel, include_probes: bool = False
|
|
121
|
+
) -> JobSubmission:
|
|
119
122
|
job_provisioning_data = get_job_provisioning_data(job_model)
|
|
120
123
|
if job_provisioning_data is not None:
|
|
121
124
|
# TODO remove after transitioning to computed fields
|
|
@@ -130,17 +133,20 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
|
|
|
130
133
|
):
|
|
131
134
|
backend_data = json.loads(job_provisioning_data.backend_data)
|
|
132
135
|
job_provisioning_data.backend = backend_data["base_backend"]
|
|
133
|
-
last_processed_at = job_model.last_processed_at
|
|
136
|
+
last_processed_at = job_model.last_processed_at
|
|
134
137
|
finished_at = None
|
|
135
138
|
if job_model.status.is_finished():
|
|
136
139
|
finished_at = last_processed_at
|
|
137
140
|
status_message = _get_job_status_message(job_model)
|
|
138
141
|
error = _get_job_error(job_model)
|
|
142
|
+
probes = []
|
|
143
|
+
if include_probes:
|
|
144
|
+
probes = [probe_model_to_probe(pm) for pm in job_model.probes]
|
|
139
145
|
return JobSubmission(
|
|
140
146
|
id=job_model.id,
|
|
141
147
|
submission_num=job_model.submission_num,
|
|
142
148
|
deployment_num=job_model.deployment_num,
|
|
143
|
-
submitted_at=job_model.submitted_at
|
|
149
|
+
submitted_at=job_model.submitted_at,
|
|
144
150
|
last_processed_at=last_processed_at,
|
|
145
151
|
finished_at=finished_at,
|
|
146
152
|
inactivity_secs=job_model.inactivity_secs,
|
|
@@ -152,6 +158,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
|
|
|
152
158
|
job_provisioning_data=job_provisioning_data,
|
|
153
159
|
job_runtime_data=get_job_runtime_data(job_model),
|
|
154
160
|
error=error,
|
|
161
|
+
probes=probes,
|
|
155
162
|
)
|
|
156
163
|
|
|
157
164
|
|
|
@@ -231,10 +238,7 @@ async def process_terminating_job(
|
|
|
231
238
|
Graceful stop should already be done by `process_terminating_run`.
|
|
232
239
|
Caller must acquire the locks on the job and the job's instance.
|
|
233
240
|
"""
|
|
234
|
-
if (
|
|
235
|
-
job_model.remove_at is not None
|
|
236
|
-
and job_model.remove_at.replace(tzinfo=timezone.utc) > common.get_current_datetime()
|
|
237
|
-
):
|
|
241
|
+
if job_model.remove_at is not None and job_model.remove_at > common.get_current_datetime():
|
|
238
242
|
# it's too early to terminate the instance
|
|
239
243
|
return
|
|
240
244
|
|
|
@@ -550,24 +554,25 @@ def _should_force_detach_volume(job_model: JobModel, stop_duration: Optional[int
|
|
|
550
554
|
return (
|
|
551
555
|
job_model.volumes_detached_at is not None
|
|
552
556
|
and common.get_current_datetime()
|
|
553
|
-
> job_model.volumes_detached_at
|
|
557
|
+
> job_model.volumes_detached_at + MIN_FORCE_DETACH_WAIT_PERIOD
|
|
554
558
|
and (
|
|
555
559
|
job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER
|
|
556
560
|
or stop_duration is not None
|
|
557
561
|
and common.get_current_datetime()
|
|
558
|
-
> job_model.volumes_detached_at
|
|
559
|
-
+ timedelta(seconds=stop_duration)
|
|
562
|
+
> job_model.volumes_detached_at + timedelta(seconds=stop_duration)
|
|
560
563
|
)
|
|
561
564
|
)
|
|
562
565
|
|
|
563
566
|
|
|
564
567
|
async def get_instances_ids_with_detaching_volumes(session: AsyncSession) -> List[UUID]:
|
|
565
568
|
res = await session.execute(
|
|
566
|
-
select(JobModel)
|
|
569
|
+
select(JobModel)
|
|
570
|
+
.where(
|
|
567
571
|
JobModel.status == JobStatus.TERMINATING,
|
|
568
572
|
JobModel.used_instance_id.is_not(None),
|
|
569
573
|
JobModel.volumes_detached_at.is_not(None),
|
|
570
574
|
)
|
|
575
|
+
.options(load_only(JobModel.used_instance_id))
|
|
571
576
|
)
|
|
572
577
|
job_models = res.scalars().all()
|
|
573
578
|
return [jm.used_instance_id for jm in job_models if jm.used_instance_id]
|
|
@@ -11,8 +11,14 @@ from dstack._internal import settings
|
|
|
11
11
|
from dstack._internal.core.errors import DockerRegistryError, ServerClientError
|
|
12
12
|
from dstack._internal.core.models.common import RegistryAuth
|
|
13
13
|
from dstack._internal.core.models.configurations import (
|
|
14
|
+
DEFAULT_PROBE_INTERVAL,
|
|
15
|
+
DEFAULT_PROBE_METHOD,
|
|
16
|
+
DEFAULT_PROBE_READY_AFTER,
|
|
17
|
+
DEFAULT_PROBE_TIMEOUT,
|
|
18
|
+
DEFAULT_PROBE_URL,
|
|
14
19
|
DEFAULT_REPO_DIR,
|
|
15
20
|
PortMapping,
|
|
21
|
+
ProbeConfig,
|
|
16
22
|
PythonVersion,
|
|
17
23
|
RunConfigurationType,
|
|
18
24
|
ServiceConfiguration,
|
|
@@ -26,6 +32,7 @@ from dstack._internal.core.models.runs import (
|
|
|
26
32
|
AppSpec,
|
|
27
33
|
JobSpec,
|
|
28
34
|
JobSSHKey,
|
|
35
|
+
ProbeSpec,
|
|
29
36
|
Requirements,
|
|
30
37
|
Retry,
|
|
31
38
|
RunSpec,
|
|
@@ -155,6 +162,7 @@ class JobConfigurator(ABC):
|
|
|
155
162
|
repo_code_hash=self.run_spec.repo_code_hash,
|
|
156
163
|
file_archives=self.run_spec.file_archives,
|
|
157
164
|
service_port=self._service_port(),
|
|
165
|
+
probes=self._probes(),
|
|
158
166
|
)
|
|
159
167
|
return job_spec
|
|
160
168
|
|
|
@@ -313,6 +321,11 @@ class JobConfigurator(ABC):
|
|
|
313
321
|
return self.run_spec.configuration.port.container_port
|
|
314
322
|
return None
|
|
315
323
|
|
|
324
|
+
def _probes(self) -> list[ProbeSpec]:
|
|
325
|
+
if isinstance(self.run_spec.configuration, ServiceConfiguration):
|
|
326
|
+
return list(map(_probe_config_to_spec, self.run_spec.configuration.probes))
|
|
327
|
+
return []
|
|
328
|
+
|
|
316
329
|
|
|
317
330
|
def interpolate_job_volumes(
|
|
318
331
|
run_volumes: List[Union[MountPoint, str]],
|
|
@@ -353,6 +366,19 @@ def interpolate_job_volumes(
|
|
|
353
366
|
return job_volumes
|
|
354
367
|
|
|
355
368
|
|
|
369
|
+
def _probe_config_to_spec(c: ProbeConfig) -> ProbeSpec:
|
|
370
|
+
return ProbeSpec(
|
|
371
|
+
type=c.type,
|
|
372
|
+
url=c.url if c.url is not None else DEFAULT_PROBE_URL,
|
|
373
|
+
timeout=c.timeout if c.timeout is not None else DEFAULT_PROBE_TIMEOUT,
|
|
374
|
+
interval=c.interval if c.interval is not None else DEFAULT_PROBE_INTERVAL,
|
|
375
|
+
ready_after=c.ready_after if c.ready_after is not None else DEFAULT_PROBE_READY_AFTER,
|
|
376
|
+
method=c.method if c.method is not None else DEFAULT_PROBE_METHOD,
|
|
377
|
+
headers=c.headers,
|
|
378
|
+
body=c.body,
|
|
379
|
+
)
|
|
380
|
+
|
|
381
|
+
|
|
356
382
|
def _join_shell_commands(commands: List[str]) -> str:
|
|
357
383
|
for i, cmd in enumerate(commands):
|
|
358
384
|
cmd = cmd.strip()
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
from typing import Union
|
|
2
2
|
|
|
3
|
-
from dstack._internal.server.models import GatewayModel, JobModel, RunModel
|
|
3
|
+
from dstack._internal.server.models import GatewayModel, JobModel, ProbeModel, RunModel
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
def fmt(model: Union[RunModel, JobModel, GatewayModel]) -> str:
|
|
6
|
+
def fmt(model: Union[RunModel, JobModel, GatewayModel, ProbeModel]) -> str:
|
|
7
7
|
"""Consistent string representation of a model for logging."""
|
|
8
8
|
if isinstance(model, RunModel):
|
|
9
9
|
return f"run({model.id.hex[:6]}){model.run_name}"
|
|
@@ -11,4 +11,6 @@ def fmt(model: Union[RunModel, JobModel, GatewayModel]) -> str:
|
|
|
11
11
|
return f"job({model.id.hex[:6]}){model.job_name}"
|
|
12
12
|
if isinstance(model, GatewayModel):
|
|
13
13
|
return f"gateway({model.id.hex[:6]}){model.name}"
|
|
14
|
+
if isinstance(model, ProbeModel):
|
|
15
|
+
return f"probe({model.id.hex[:6]}){model.name}"
|
|
14
16
|
return str(model)
|