dstack 0.19.21__py3-none-any.whl → 0.19.23rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dstack might be problematic.
- dstack/_internal/cli/commands/apply.py +8 -3
- dstack/_internal/cli/services/configurators/__init__.py +8 -0
- dstack/_internal/cli/services/configurators/fleet.py +1 -1
- dstack/_internal/cli/services/configurators/gateway.py +1 -1
- dstack/_internal/cli/services/configurators/run.py +11 -1
- dstack/_internal/cli/services/configurators/volume.py +1 -1
- dstack/_internal/cli/utils/common.py +48 -5
- dstack/_internal/cli/utils/fleet.py +5 -5
- dstack/_internal/cli/utils/run.py +32 -0
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
- dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
- dstack/_internal/core/backends/hotaisle/backend.py +16 -0
- dstack/_internal/core/backends/hotaisle/compute.py +225 -0
- dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
- dstack/_internal/core/backends/hotaisle/models.py +45 -0
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
- dstack/_internal/core/backends/models.py +8 -0
- dstack/_internal/core/backends/nebius/compute.py +8 -2
- dstack/_internal/core/backends/nebius/fabrics.py +1 -0
- dstack/_internal/core/backends/nebius/resources.py +9 -0
- dstack/_internal/core/compatibility/runs.py +8 -0
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/core/models/configurations.py +139 -1
- dstack/_internal/core/models/health.py +28 -0
- dstack/_internal/core/models/instances.py +2 -0
- dstack/_internal/core/models/logs.py +2 -1
- dstack/_internal/core/models/runs.py +21 -1
- dstack/_internal/core/services/ssh/tunnel.py +7 -0
- dstack/_internal/server/app.py +4 -0
- dstack/_internal/server/background/__init__.py +4 -0
- dstack/_internal/server/background/tasks/process_instances.py +107 -56
- dstack/_internal/server/background/tasks/process_probes.py +164 -0
- dstack/_internal/server/background/tasks/process_running_jobs.py +13 -0
- dstack/_internal/server/background/tasks/process_runs.py +21 -14
- dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
- dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
- dstack/_internal/server/models.py +41 -0
- dstack/_internal/server/routers/instances.py +33 -5
- dstack/_internal/server/schemas/health/dcgm.py +56 -0
- dstack/_internal/server/schemas/instances.py +32 -0
- dstack/_internal/server/schemas/runner.py +5 -0
- dstack/_internal/server/services/instances.py +103 -1
- dstack/_internal/server/services/jobs/__init__.py +8 -1
- dstack/_internal/server/services/jobs/configurators/base.py +26 -0
- dstack/_internal/server/services/logging.py +4 -2
- dstack/_internal/server/services/logs/aws.py +13 -1
- dstack/_internal/server/services/logs/gcp.py +16 -1
- dstack/_internal/server/services/probes.py +6 -0
- dstack/_internal/server/services/projects.py +16 -4
- dstack/_internal/server/services/runner/client.py +52 -20
- dstack/_internal/server/services/runner/ssh.py +4 -4
- dstack/_internal/server/services/runs.py +49 -13
- dstack/_internal/server/services/ssh.py +66 -0
- dstack/_internal/server/settings.py +13 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
- dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
- dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
- dstack/_internal/server/testing/common.py +44 -0
- dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
- dstack/_internal/settings.py +3 -0
- dstack/_internal/utils/common.py +15 -0
- dstack/api/server/__init__.py +1 -1
- dstack/version.py +1 -1
- {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/METADATA +14 -14
- {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/RECORD +71 -58
- /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
- {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/WHEEL +0 -0
- {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/licenses/LICENSE.md +0 -0

dstack/_internal/server/models.py

@@ -28,6 +28,7 @@ from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.common import CoreModel
 from dstack._internal.core.models.fleets import FleetStatus
 from dstack._internal.core.models.gateways import GatewayStatus
+from dstack._internal.core.models.health import HealthStatus
 from dstack._internal.core.models.instances import InstanceStatus
 from dstack._internal.core.models.profiles import (
     DEFAULT_FLEET_TERMINATION_IDLE_TIME,
@@ -427,6 +428,9 @@ class JobModel(BaseModel):
     replica_num: Mapped[int] = mapped_column(Integer)
     deployment_num: Mapped[int] = mapped_column(Integer)
     job_runtime_data: Mapped[Optional[str]] = mapped_column(Text)
+    probes: Mapped[list["ProbeModel"]] = relationship(
+        back_populates="job", order_by="ProbeModel.probe_num"
+    )


 class GatewayModel(BaseModel):
@@ -596,7 +600,11 @@ class InstanceModel(BaseModel):
     # instance termination handling
     termination_deadline: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     termination_reason: Mapped[Optional[str]] = mapped_column(String(4000))
+    # Deprecated since 0.19.22, not used
     health_status: Mapped[Optional[str]] = mapped_column(String(4000))
+    health: Mapped[HealthStatus] = mapped_column(
+        EnumAsString(HealthStatus, 100), default=HealthStatus.HEALTHY
+    )
     first_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
     last_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)

@@ -627,6 +635,21 @@ class InstanceModel(BaseModel):
     )


+class InstanceHealthCheckModel(BaseModel):
+    __tablename__ = "instance_health_checks"
+
+    id: Mapped[uuid.UUID] = mapped_column(
+        UUIDType(binary=False), primary_key=True, default=uuid.uuid4
+    )
+
+    instance_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("instances.id"))
+    instance: Mapped["InstanceModel"] = relationship()
+
+    collected_at: Mapped[datetime] = mapped_column(NaiveDateTime)
+    status: Mapped[HealthStatus] = mapped_column(EnumAsString(HealthStatus, 100))
+    response: Mapped[str] = mapped_column(Text)
+
+
 class VolumeModel(BaseModel):
     __tablename__ = "volumes"

@@ -729,6 +752,24 @@ class JobPrometheusMetrics(BaseModel):
     text: Mapped[str] = mapped_column(Text)


+class ProbeModel(BaseModel):
+    __tablename__ = "probes"
+    __table_args__ = (UniqueConstraint("job_id", "probe_num", name="uq_probes_job_id_probe_num"),)
+
+    id: Mapped[uuid.UUID] = mapped_column(
+        UUIDType(binary=False), primary_key=True, default=uuid.uuid4
+    )
+    name: Mapped[str] = mapped_column(String(100))
+
+    job_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("jobs.id"), primary_key=True)
+    job: Mapped["JobModel"] = relationship(back_populates="probes")
+
+    probe_num: Mapped[int] = mapped_column(Integer)  # index in JobSpec.probes
+    due: Mapped[datetime] = mapped_column(NaiveDateTime)
+    success_streak: Mapped[int] = mapped_column(BigInteger)
+    active: Mapped[bool] = mapped_column(Boolean)
+
+
 class SecretModel(BaseModel):
     __tablename__ = "secrets"
     __table_args__ = (UniqueConstraint("project_id", "name", name="uq_secrets_project_id_name"),)
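
The ordered `probes` relationship means callers can treat `job_model.probes` as already sorted by `probe_num`, i.e. aligned with `JobSpec.probes`. A hypothetical helper (not part of the package) showing how the scheduling columns might be read:

from datetime import datetime

def overdue_probes(job_model) -> list:
    # job_model.probes is ordered by probe_num via the relationship's order_by;
    # `active` and `due` presumably drive the scheduler in process_probes.py.
    return [p for p in job_model.probes if p.active and p.due <= datetime.utcnow()]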

dstack/_internal/server/routers/instances.py

@@ -3,12 +3,16 @@ from typing import List
 from fastapi import APIRouter, Depends
 from sqlalchemy.ext.asyncio import AsyncSession

-import dstack._internal.server.services.instances as instances
+import dstack._internal.server.services.instances as instances_services
 from dstack._internal.core.models.instances import Instance
 from dstack._internal.server.db import get_session
-from dstack._internal.server.models import UserModel
-from dstack._internal.server.schemas.instances import ListInstancesRequest
-from dstack._internal.server.security.permissions import Authenticated
+from dstack._internal.server.models import ProjectModel, UserModel
+from dstack._internal.server.schemas.instances import (
+    GetInstanceHealthChecksRequest,
+    GetInstanceHealthChecksResponse,
+    ListInstancesRequest,
+)
+from dstack._internal.server.security.permissions import Authenticated, ProjectMember
 from dstack._internal.server.utils.routers import (
     CustomORJSONResponse,
     get_base_api_additional_responses,
@@ -19,6 +23,11 @@ root_router = APIRouter(
     tags=["instances"],
     responses=get_base_api_additional_responses(),
 )
+project_router = APIRouter(
+    prefix="/api/project/{project_name}/instances",
+    tags=["instances"],
+    responses=get_base_api_additional_responses(),
+)


 @root_router.post("/list", response_model=List[Instance])
@@ -35,7 +44,7 @@ async def list_instances(
     the last instance from the previous page as `prev_created_at` and `prev_id`.
     """
     return CustomORJSONResponse(
-        await instances.list_user_instances(
+        await instances_services.list_user_instances(
            session=session,
            user=user,
            project_names=body.project_names,
@@ -47,3 +56,22 @@ async def list_instances(
            ascending=body.ascending,
        )
    )
+
+
+@project_router.post("/get_instance_health_checks", response_model=GetInstanceHealthChecksResponse)
+async def get_instance_health_checks(
+    body: GetInstanceHealthChecksRequest,
+    session: AsyncSession = Depends(get_session),
+    user_project: tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
+):
+    _, project = user_project
+    health_checks = await instances_services.get_instance_health_checks(
+        session=session,
+        project=project,
+        fleet_name=body.fleet_name,
+        instance_num=body.instance_num,
+        after=body.after,
+        before=body.before,
+        limit=body.limit,
+    )
+    return CustomORJSONResponse(GetInstanceHealthChecksResponse(health_checks=health_checks))
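
A minimal sketch of calling the new project-scoped endpoint over plain HTTP; the server URL, project name, fleet name, and bearer token are placeholders, and the official client may expose its own wrapper:

import requests

resp = requests.post(
    "http://localhost:3000/api/project/main/instances/get_instance_health_checks",
    headers={"Authorization": "Bearer <token>"},
    json={"fleet_name": "my-fleet", "instance_num": 0, "limit": 100},
)
resp.raise_for_status()
for check in resp.json()["health_checks"]:
    print(check["collected_at"], check["status"])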

dstack/_internal/server/schemas/health/dcgm.py (new file)

@@ -0,0 +1,56 @@
+from enum import IntEnum
+
+from dstack._internal.core.models.common import CoreModel
+from dstack._internal.core.models.health import HealthStatus
+
+
+class DCGMHealthResult(IntEnum):
+    """
+    `dcgmHealthWatchResult_enum`
+
+    See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/const.go#L1020-L1026
+    """
+
+    DCGM_HEALTH_RESULT_PASS = 0
+    DCGM_HEALTH_RESULT_WARN = 10
+    DCGM_HEALTH_RESULT_FAIL = 20
+
+    def to_health_status(self) -> HealthStatus:
+        if self == self.DCGM_HEALTH_RESULT_PASS:
+            return HealthStatus.HEALTHY
+        if self == self.DCGM_HEALTH_RESULT_WARN:
+            return HealthStatus.WARNING
+        if self == self.DCGM_HEALTH_RESULT_FAIL:
+            return HealthStatus.FAILURE
+        raise AssertionError("should not reach here")
+
+
+class DCGMHealthIncident(CoreModel):
+    """
+    Flattened `dcgmIncidentInfo_t`
+
+    See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/health.go#L68-L73
+    """
+
+    # dcgmIncidentInfo_t
+    system: int
+    health: DCGMHealthResult
+
+    # dcgmDiagErrorDetail_t
+    error_message: str
+    error_code: int
+
+    # dcgmGroupEntityPair_t
+    entity_group_id: int
+    entity_id: int
+
+
+class DCGMHealthResponse(CoreModel):
+    """
+    `dcgmHealthResponse_v5`
+
+    See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/health.go#L75-L78
+    """
+
+    overall_health: DCGMHealthResult
+    incidents: list[DCGMHealthIncident]
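
Note the enum keeps go-dcgm's raw values (0/10/20 rather than 0/1/2), so runner payloads can be parsed without translation. A quick sanity check of the mapping, assuming HealthStatus exposes HEALTHY/WARNING/FAILURE members as the other hunks in this diff indicate:

from dstack._internal.core.models.health import HealthStatus
from dstack._internal.server.schemas.health.dcgm import DCGMHealthResult

assert DCGMHealthResult(0).to_health_status() is HealthStatus.HEALTHY
assert DCGMHealthResult(10).to_health_status() is HealthStatus.WARNING
assert DCGMHealthResult.DCGM_HEALTH_RESULT_FAIL.to_health_status() is HealthStatus.FAILURE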

dstack/_internal/server/schemas/instances.py

@@ -3,6 +3,8 @@ from typing import Optional
 from uuid import UUID

 from dstack._internal.core.models.common import CoreModel
+from dstack._internal.core.models.health import HealthCheck, HealthStatus
+from dstack._internal.server.schemas.runner import InstanceHealthResponse


 class ListInstancesRequest(CoreModel):
@@ -13,3 +15,33 @@ class ListInstancesRequest(CoreModel):
     prev_id: Optional[UUID] = None
     limit: int = 1000
     ascending: bool = False
+
+
+class InstanceCheck(CoreModel):
+    reachable: bool
+    message: Optional[str] = None
+    health_response: Optional[InstanceHealthResponse] = None
+
+    def get_health_status(self) -> HealthStatus:
+        if self.health_response is None:
+            return HealthStatus.HEALTHY
+        if self.health_response.dcgm is None:
+            return HealthStatus.HEALTHY
+        return self.health_response.dcgm.overall_health.to_health_status()
+
+    def has_health_checks(self) -> bool:
+        if self.health_response is None:
+            return False
+        return self.health_response.dcgm is not None
+
+
+class GetInstanceHealthChecksRequest(CoreModel):
+    fleet_name: str
+    instance_num: int
+    after: Optional[datetime] = None
+    before: Optional[datetime] = None
+    limit: Optional[int] = None
+
+
+class GetInstanceHealthChecksResponse(CoreModel):
+    health_checks: list[HealthCheck]
|
|
|
16
16
|
RunSpec,
|
|
17
17
|
)
|
|
18
18
|
from dstack._internal.core.models.volumes import InstanceMountPoint, VolumeMountPoint
|
|
19
|
+
from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class JobStateEvent(CoreModel):
|
|
@@ -114,6 +115,10 @@ class HealthcheckResponse(CoreModel):
|
|
|
114
115
|
version: str
|
|
115
116
|
|
|
116
117
|
|
|
118
|
+
class InstanceHealthResponse(CoreModel):
|
|
119
|
+
dcgm: Optional[DCGMHealthResponse] = None
|
|
120
|
+
|
|
121
|
+
|
|
117
122
|
class GPUMetrics(CoreModel):
|
|
118
123
|
gpu_memory_usage_bytes: int
|
|
119
124
|
gpu_util_percent: int
|
|

dstack/_internal/server/services/instances.py

@@ -1,3 +1,4 @@
+import operator
 import uuid
 from collections.abc import Container, Iterable
 from datetime import datetime
@@ -6,15 +7,17 @@ from typing import Dict, List, Literal, Optional, Union
 import gpuhunt
 from sqlalchemy import and_, or_, select
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload
+from sqlalchemy.orm import joinedload, load_only

 from dstack._internal.core.backends.base.offers import (
     offer_to_catalog_item,
     requirements_to_query_filter,
 )
 from dstack._internal.core.backends.features import BACKENDS_WITH_MULTINODE_SUPPORT
+from dstack._internal.core.errors import ResourceNotExistsError
 from dstack._internal.core.models.backends.base import BackendType
 from dstack._internal.core.models.envs import Env
+from dstack._internal.core.models.health import HealthCheck, HealthEvent, HealthStatus
 from dstack._internal.core.models.instances import (
     Instance,
     InstanceAvailability,
@@ -38,10 +41,13 @@ from dstack._internal.core.models.volumes import Volume
 from dstack._internal.core.services.profiles import get_termination
 from dstack._internal.server.models import (
     FleetModel,
+    InstanceHealthCheckModel,
     InstanceModel,
     ProjectModel,
     UserModel,
 )
+from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse
+from dstack._internal.server.schemas.runner import InstanceHealthResponse
 from dstack._internal.server.services.offers import generate_shared_offer
 from dstack._internal.server.services.projects import list_user_project_models
 from dstack._internal.utils import common as common_utils
@@ -50,6 +56,57 @@ from dstack._internal.utils.logging import get_logger
 logger = get_logger(__name__)


+async def get_instance_health_checks(
+    session: AsyncSession,
+    project: ProjectModel,
+    fleet_name: str,
+    instance_num: int,
+    after: Optional[datetime] = None,
+    before: Optional[datetime] = None,
+    limit: Optional[int] = None,
+) -> list[HealthCheck]:
+    """
+    Returns instance health checks ordered from the latest to the earliest.
+
+    Expected usage:
+    * limit=100 — get the latest 100 checks
+    * after=<now - 1 hour> — get checks for the last hour
+    * before=<earliest timestamp from the last batch>, limit=100 — paginate back in history
+    """
+    res = await session.execute(
+        select(InstanceModel)
+        .join(FleetModel)
+        .where(
+            ~InstanceModel.deleted,
+            InstanceModel.project_id == project.id,
+            InstanceModel.instance_num == instance_num,
+            FleetModel.name == fleet_name,
+        )
+        .options(load_only(InstanceModel.id))
+    )
+    instance = res.scalar_one_or_none()
+    if instance is None:
+        raise ResourceNotExistsError()
+
+    stmt = (
+        select(InstanceHealthCheckModel)
+        .where(InstanceHealthCheckModel.instance_id == instance.id)
+        .order_by(InstanceHealthCheckModel.collected_at.desc())
+    )
+    if after is not None:
+        stmt = stmt.where(InstanceHealthCheckModel.collected_at > after)
+    if before is not None:
+        stmt = stmt.where(InstanceHealthCheckModel.collected_at < before)
+    if limit is not None:
+        stmt = stmt.limit(limit)
+    health_checks: list[HealthCheck] = []
+    res = await session.execute(stmt)
+    for health_check_model in res.scalars():
+        health_check = instance_health_check_model_to_health_check(health_check_model)
+        health_checks.append(health_check)
+    return health_checks
+
+
 def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
     instance = Instance(
         id=instance_model.id,
@@ -60,6 +117,7 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
         instance_num=instance_model.instance_num,
         status=instance_model.status,
         unreachable=instance_model.unreachable,
+        health_status=instance_model.health,
         termination_reason=instance_model.termination_reason,
         created=instance_model.created_at,
         total_blocks=instance_model.total_blocks,
@@ -81,6 +139,48 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
     return instance


+def instance_health_check_model_to_health_check(model: InstanceHealthCheckModel) -> HealthCheck:
+    collected_at = model.collected_at
+    status = HealthStatus.HEALTHY
+    events: list[HealthEvent] = []
+    instance_health_response = get_instance_health_response(model)
+    if (dcgm := instance_health_response.dcgm) is not None:
+        dcgm_health_check = dcgm_health_response_to_health_check(dcgm, collected_at)
+        status = dcgm_health_check.status
+        events.extend(dcgm_health_check.events)
+    events.sort(key=operator.attrgetter("timestamp"), reverse=True)
+    return HealthCheck(
+        collected_at=collected_at,
+        status=status,
+        events=events,
+    )
+
+
+def dcgm_health_response_to_health_check(
+    response: DCGMHealthResponse, collected_at: datetime
+) -> HealthCheck:
+    events: list[HealthEvent] = []
+    for incident in response.incidents:
+        events.append(
+            HealthEvent(
+                timestamp=collected_at,
+                status=incident.health.to_health_status(),
+                message=incident.error_message,
+            )
+        )
+    return HealthCheck(
+        collected_at=collected_at,
+        status=response.overall_health.to_health_status(),
+        events=events,
+    )
+
+
+def get_instance_health_response(
+    instance_health_check_model: InstanceHealthCheckModel,
+) -> InstanceHealthResponse:
+    return InstanceHealthResponse.__response__.parse_raw(instance_health_check_model.response)
+
+
 def get_instance_provisioning_data(instance_model: InstanceModel) -> Optional[JobProvisioningData]:
     if instance_model.job_provisioning_data is None:
         return None
@@ -194,6 +294,8 @@ def filter_pool_instances(
             continue
         if instance.unreachable:
             continue
+        if instance.health.is_failure():
+            continue
         fleet = instance.fleet
         if profile.fleets is not None and (fleet is None or fleet.name not in profile.fleets):
             continue
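
A sketch of the backward-pagination pattern the docstring describes, assuming `session` and `project` come from the server's usual request context:

from datetime import datetime
from typing import Optional

async def iter_all_health_checks(session, project, fleet_name: str, instance_num: int):
    # Walk back through history in batches of 100; results are latest-first,
    # so the last item of each batch carries the earliest timestamp seen so far.
    before: Optional[datetime] = None
    while True:
        batch = await get_instance_health_checks(
            session=session, project=project, fleet_name=fleet_name,
            instance_num=instance_num, before=before, limit=100,
        )
        if not batch:
            return
        for check in batch:
            yield check
        before = batch[-1].collected_at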

dstack/_internal/server/services/jobs/__init__.py

@@ -52,6 +52,7 @@ from dstack._internal.server.services.jobs.configurators.dev import DevEnvironmentJobConfigurator
 from dstack._internal.server.services.jobs.configurators.service import ServiceJobConfigurator
 from dstack._internal.server.services.jobs.configurators.task import TaskJobConfigurator
 from dstack._internal.server.services.logging import fmt
+from dstack._internal.server.services.probes import probe_model_to_probe
 from dstack._internal.server.services.runner import client
 from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
 from dstack._internal.server.services.volumes import (
@@ -115,7 +116,9 @@ async def get_run_job_model(
     return res.scalar_one_or_none()


-def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
+def job_model_to_job_submission(
+    job_model: JobModel, include_probes: bool = False
+) -> JobSubmission:
     job_provisioning_data = get_job_provisioning_data(job_model)
     if job_provisioning_data is not None:
         # TODO remove after transitioning to computed fields
@@ -136,6 +139,9 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
         finished_at = last_processed_at
     status_message = _get_job_status_message(job_model)
     error = _get_job_error(job_model)
+    probes = []
+    if include_probes:
+        probes = [probe_model_to_probe(pm) for pm in job_model.probes]
     return JobSubmission(
         id=job_model.id,
         submission_num=job_model.submission_num,
@@ -152,6 +158,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
         job_provisioning_data=job_provisioning_data,
         job_runtime_data=get_job_runtime_data(job_model),
         error=error,
+        probes=probes,
     )

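
Hypothetical call sites, assuming `job_model` is a JobModel loaded with its `probes` relationship; the default keeps existing callers unchanged while run-detail paths opt in:

submission = job_model_to_job_submission(job_model)  # probes == []
detailed = job_model_to_job_submission(job_model, include_probes=True)  # probes populated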

dstack/_internal/server/services/jobs/configurators/base.py

@@ -11,8 +11,14 @@ from dstack._internal import settings
 from dstack._internal.core.errors import DockerRegistryError, ServerClientError
 from dstack._internal.core.models.common import RegistryAuth
 from dstack._internal.core.models.configurations import (
+    DEFAULT_PROBE_INTERVAL,
+    DEFAULT_PROBE_METHOD,
+    DEFAULT_PROBE_READY_AFTER,
+    DEFAULT_PROBE_TIMEOUT,
+    DEFAULT_PROBE_URL,
     DEFAULT_REPO_DIR,
     PortMapping,
+    ProbeConfig,
     PythonVersion,
     RunConfigurationType,
     ServiceConfiguration,
@@ -26,6 +32,7 @@ from dstack._internal.core.models.runs import (
     AppSpec,
     JobSpec,
     JobSSHKey,
+    ProbeSpec,
     Requirements,
     Retry,
     RunSpec,
@@ -155,6 +162,7 @@ class JobConfigurator(ABC):
             repo_code_hash=self.run_spec.repo_code_hash,
             file_archives=self.run_spec.file_archives,
             service_port=self._service_port(),
+            probes=self._probes(),
         )
         return job_spec

@@ -313,6 +321,11 @@ class JobConfigurator(ABC):
             return self.run_spec.configuration.port.container_port
         return None

+    def _probes(self) -> list[ProbeSpec]:
+        if isinstance(self.run_spec.configuration, ServiceConfiguration):
+            return list(map(_probe_config_to_spec, self.run_spec.configuration.probes))
+        return []
+

 def interpolate_job_volumes(
     run_volumes: List[Union[MountPoint, str]],
@@ -353,6 +366,19 @@ def interpolate_job_volumes(
     return job_volumes


+def _probe_config_to_spec(c: ProbeConfig) -> ProbeSpec:
+    return ProbeSpec(
+        type=c.type,
+        url=c.url if c.url is not None else DEFAULT_PROBE_URL,
+        timeout=c.timeout if c.timeout is not None else DEFAULT_PROBE_TIMEOUT,
+        interval=c.interval if c.interval is not None else DEFAULT_PROBE_INTERVAL,
+        ready_after=c.ready_after if c.ready_after is not None else DEFAULT_PROBE_READY_AFTER,
+        method=c.method if c.method is not None else DEFAULT_PROBE_METHOD,
+        headers=c.headers,
+        body=c.body,
+    )
+
+
 def _join_shell_commands(commands: List[str]) -> str:
     for i, cmd in enumerate(commands):
         cmd = cmd.strip()
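
A hypothetical illustration of the defaulting rule, assuming an HTTP probe and that only `type` is required on ProbeConfig (its actual fields and validators live in configurations.py, +139 lines in this release):

config = ProbeConfig(type="http")
spec = _probe_config_to_spec(config)
assert spec.url == DEFAULT_PROBE_URL
assert spec.timeout == DEFAULT_PROBE_TIMEOUT
assert spec.interval == DEFAULT_PROBE_INTERVAL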

dstack/_internal/server/services/logging.py

@@ -1,9 +1,9 @@
 from typing import Union

-from dstack._internal.server.models import GatewayModel, JobModel, RunModel
+from dstack._internal.server.models import GatewayModel, JobModel, ProbeModel, RunModel


-def fmt(model: Union[RunModel, JobModel, GatewayModel]) -> str:
+def fmt(model: Union[RunModel, JobModel, GatewayModel, ProbeModel]) -> str:
     """Consistent string representation of a model for logging."""
     if isinstance(model, RunModel):
         return f"run({model.id.hex[:6]}){model.run_name}"
@@ -11,4 +11,6 @@ def fmt(model: Union[RunModel, JobModel, GatewayModel]) -> str:
         return f"job({model.id.hex[:6]}){model.job_name}"
     if isinstance(model, GatewayModel):
         return f"gateway({model.id.hex[:6]}){model.name}"
+    if isinstance(model, ProbeModel):
+        return f"probe({model.id.hex[:6]}){model.name}"
     return str(model)
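
A standalone sketch of the output shape for the new ProbeModel branch (id and name are made up):

import uuid

model_id, probe_name = uuid.uuid4(), "readiness"
print(f"probe({model_id.hex[:6]}){probe_name}")  # e.g. probe(3f9a1c)readiness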

dstack/_internal/server/services/logs/aws.py

@@ -1,5 +1,7 @@
 import itertools
 import operator
+import urllib
+import urllib.parse
 from contextlib import contextmanager
 from datetime import datetime, timedelta, timezone
 from typing import Iterator, List, Optional, Set, Tuple, TypedDict
@@ -64,6 +66,7 @@ class CloudWatchLogStorage(LogStorage):
         self._client = session.client("logs")
         self._check_group_exists(group)
         self._group = group
+        self._region = self._client.meta.region_name
         # Stores names of already created streams.
         # XXX: This set acts as an unbound cache. If this becomes a problem (in case of _very_ long
         # running server and/or lots of jobs, consider replacing it with an LRU cache, e.g.,
@@ -103,7 +106,11 @@ class CloudWatchLogStorage(LogStorage):
             )
             for cw_event in cw_events
         ]
-        return JobSubmissionLogs(logs=logs, next_token=next_token)
+        return JobSubmissionLogs(
+            logs=logs,
+            external_url=self._get_stream_external_url(stream),
+            next_token=next_token,
+        )

     def _get_log_events_with_retry(
         self, stream: str, request: PollLogsRequest
@@ -181,6 +188,11 @@ class CloudWatchLogStorage(LogStorage):

         return events, next_token

+    def _get_stream_external_url(self, stream: str) -> str:
+        quoted_group = urllib.parse.quote(self._group, safe="")
+        quoted_stream = urllib.parse.quote(stream, safe="")
+        return f"https://console.aws.amazon.com/cloudwatch/home?region={self._region}#logsV2:log-groups/log-group/{quoted_group}/log-events/{quoted_stream}"
+
     def write_logs(
         self,
         project: ProjectModel,
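
A standalone sketch of why safe="" matters here: CloudWatch log group names contain "/", which quote() would otherwise leave unencoded and break the console deep link:

import urllib.parse

print(urllib.parse.quote("/dstack/logs", safe=""))  # %2Fdstack%2Flogs
print(urllib.parse.quote("/dstack/logs"))           # /dstack/logs ("/" is safe by default)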

dstack/_internal/server/services/logs/gcp.py

@@ -1,3 +1,4 @@
+import urllib.parse
 from typing import List
 from uuid import UUID

@@ -48,6 +49,7 @@ class GCPLogStorage(LogStorage):
     # (https://cloud.google.com/logging/docs/analyze/custom-index).

     def __init__(self, project_id: str):
+        self.project_id = project_id
         try:
             self.client = logging_v2.Client(project=project_id)
             self.logger = self.client.logger(name=self.LOG_NAME)
@@ -106,7 +108,11 @@ class GCPLogStorage(LogStorage):
                 "GCP Logging read request limit exceeded."
                 " It's recommended to increase default entries.list request quota from 60 per minute."
             )
-        return JobSubmissionLogs(logs=logs, next_token=next_token)
+        return JobSubmissionLogs(
+            logs=logs,
+            external_url=self._get_stream_external_url(stream_name),
+            next_token=next_token if len(logs) > 0 else None,
+        )

     def write_logs(
         self,
@@ -162,3 +168,12 @@ class GCPLogStorage(LogStorage):
         self, project_name: str, run_name: str, job_submission_id: UUID, producer: LogProducer
     ) -> str:
         return f"{project_name}-{run_name}-{job_submission_id}-{producer.value}"
+
+    def _get_stream_external_url(self, stream_name: str) -> str:
+        log_name_resource_name = self._get_log_name_resource_name()
+        query = f'logName="{log_name_resource_name}" AND labels.stream="{stream_name}"'
+        quoted_query = urllib.parse.quote(query, safe="")
+        return f"https://console.cloud.google.com/logs/query;query={quoted_query}?project={self.project_id}"
+
+    def _get_log_name_resource_name(self) -> str:
+        return f"projects/{self.project_id}/logs/{self.LOG_NAME}"
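
A standalone sketch of the Logs Explorer deep link being assembled; the project id, log name, and stream are placeholders, not the values dstack actually uses:

import urllib.parse

project_id, stream = "my-project", "main-run-abc-runner"
query = f'logName="projects/{project_id}/logs/dstack-run-logs" AND labels.stream="{stream}"'
print(f"https://console.cloud.google.com/logs/query;query={urllib.parse.quote(query, safe='')}?project={project_id}")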

dstack/_internal/server/services/projects.py

@@ -197,6 +197,10 @@ async def set_project_members(
     project: ProjectModel,
     members: List[MemberSetting],
 ):
+    usernames = {m.username for m in members}
+    if len(usernames) != len(members):
+        raise ServerClientError("Cannot add same user multiple times")
+
     project = await get_project_model_by_name_or_error(
         session=session,
         project_name=project.name,
@@ -245,6 +249,10 @@ async def add_project_members(
     members: List[MemberSetting],
 ):
     """Add multiple members to a project."""
+    usernames = {m.username for m in members}
+    if len(usernames) != len(members):
+        raise ServerClientError("Cannot add same user multiple times")
+
     project = await get_project_model_by_name_or_error(
         session=session,
         project_name=project.name,
@@ -259,7 +267,10 @@ async def add_project_members(
     )

     if not is_self_join_to_public:
-        if requesting_user_role not in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
+        if user.global_role != GlobalRole.ADMIN and requesting_user_role not in [
+            ProjectRole.ADMIN,
+            ProjectRole.MANAGER,
+        ]:
             raise ForbiddenError("Access denied: insufficient permissions to add members")

     if user.global_role != GlobalRole.ADMIN and requesting_user_role == ProjectRole.MANAGER:
@@ -272,8 +283,6 @@ async def add_project_members(
     if members[0].project_role != ProjectRole.USER:
         raise ForbiddenError("Access denied: can only join public projects as user role")

-    usernames = [member.username for member in members]
-
     res = await session.execute(
         select(UserModel).where((UserModel.name.in_(usernames)) | (UserModel.email.in_(usernames)))
     )
@@ -628,7 +637,10 @@ async def remove_project_members(
     )

     if not is_self_leave:
-        if requesting_user_role not in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
+        if user.global_role != GlobalRole.ADMIN and requesting_user_role not in [
+            ProjectRole.ADMIN,
+            ProjectRole.MANAGER,
+        ]:
             raise ForbiddenError("Access denied: insufficient permissions to remove members")

     res = await session.execute(