dstack 0.19.21__py3-none-any.whl → 0.19.23rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic; see the registry's release details for more information.

Files changed (71)
  1. dstack/_internal/cli/commands/apply.py +8 -3
  2. dstack/_internal/cli/services/configurators/__init__.py +8 -0
  3. dstack/_internal/cli/services/configurators/fleet.py +1 -1
  4. dstack/_internal/cli/services/configurators/gateway.py +1 -1
  5. dstack/_internal/cli/services/configurators/run.py +11 -1
  6. dstack/_internal/cli/services/configurators/volume.py +1 -1
  7. dstack/_internal/cli/utils/common.py +48 -5
  8. dstack/_internal/cli/utils/fleet.py +5 -5
  9. dstack/_internal/cli/utils/run.py +32 -0
  10. dstack/_internal/core/backends/configurators.py +9 -0
  11. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  12. dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
  13. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  14. dstack/_internal/core/backends/hotaisle/compute.py +225 -0
  15. dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
  16. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  17. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  18. dstack/_internal/core/backends/models.py +8 -0
  19. dstack/_internal/core/backends/nebius/compute.py +8 -2
  20. dstack/_internal/core/backends/nebius/fabrics.py +1 -0
  21. dstack/_internal/core/backends/nebius/resources.py +9 -0
  22. dstack/_internal/core/compatibility/runs.py +8 -0
  23. dstack/_internal/core/models/backends/base.py +2 -0
  24. dstack/_internal/core/models/configurations.py +139 -1
  25. dstack/_internal/core/models/health.py +28 -0
  26. dstack/_internal/core/models/instances.py +2 -0
  27. dstack/_internal/core/models/logs.py +2 -1
  28. dstack/_internal/core/models/runs.py +21 -1
  29. dstack/_internal/core/services/ssh/tunnel.py +7 -0
  30. dstack/_internal/server/app.py +4 -0
  31. dstack/_internal/server/background/__init__.py +4 -0
  32. dstack/_internal/server/background/tasks/process_instances.py +107 -56
  33. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  34. dstack/_internal/server/background/tasks/process_running_jobs.py +13 -0
  35. dstack/_internal/server/background/tasks/process_runs.py +21 -14
  36. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  37. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  38. dstack/_internal/server/models.py +41 -0
  39. dstack/_internal/server/routers/instances.py +33 -5
  40. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  41. dstack/_internal/server/schemas/instances.py +32 -0
  42. dstack/_internal/server/schemas/runner.py +5 -0
  43. dstack/_internal/server/services/instances.py +103 -1
  44. dstack/_internal/server/services/jobs/__init__.py +8 -1
  45. dstack/_internal/server/services/jobs/configurators/base.py +26 -0
  46. dstack/_internal/server/services/logging.py +4 -2
  47. dstack/_internal/server/services/logs/aws.py +13 -1
  48. dstack/_internal/server/services/logs/gcp.py +16 -1
  49. dstack/_internal/server/services/probes.py +6 -0
  50. dstack/_internal/server/services/projects.py +16 -4
  51. dstack/_internal/server/services/runner/client.py +52 -20
  52. dstack/_internal/server/services/runner/ssh.py +4 -4
  53. dstack/_internal/server/services/runs.py +49 -13
  54. dstack/_internal/server/services/ssh.py +66 -0
  55. dstack/_internal/server/settings.py +13 -0
  56. dstack/_internal/server/statics/index.html +1 -1
  57. dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
  58. dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
  59. dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
  60. dstack/_internal/server/testing/common.py +44 -0
  61. dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
  62. dstack/_internal/settings.py +3 -0
  63. dstack/_internal/utils/common.py +15 -0
  64. dstack/api/server/__init__.py +1 -1
  65. dstack/version.py +1 -1
  66. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/METADATA +14 -14
  67. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/RECORD +71 -58
  68. /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
  69. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/WHEEL +0 -0
  70. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/entry_points.txt +0 -0
  71. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/licenses/LICENSE.md +0 -0
@@ -28,6 +28,7 @@ from dstack._internal.core.models.backends.base import BackendType
28
28
  from dstack._internal.core.models.common import CoreModel
29
29
  from dstack._internal.core.models.fleets import FleetStatus
30
30
  from dstack._internal.core.models.gateways import GatewayStatus
31
+ from dstack._internal.core.models.health import HealthStatus
31
32
  from dstack._internal.core.models.instances import InstanceStatus
32
33
  from dstack._internal.core.models.profiles import (
33
34
  DEFAULT_FLEET_TERMINATION_IDLE_TIME,
@@ -427,6 +428,9 @@ class JobModel(BaseModel):
427
428
  replica_num: Mapped[int] = mapped_column(Integer)
428
429
  deployment_num: Mapped[int] = mapped_column(Integer)
429
430
  job_runtime_data: Mapped[Optional[str]] = mapped_column(Text)
431
+ probes: Mapped[list["ProbeModel"]] = relationship(
432
+ back_populates="job", order_by="ProbeModel.probe_num"
433
+ )
430
434
 
431
435
 
432
436
  class GatewayModel(BaseModel):
@@ -596,7 +600,11 @@ class InstanceModel(BaseModel):
596
600
  # instance termination handling
597
601
  termination_deadline: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
598
602
  termination_reason: Mapped[Optional[str]] = mapped_column(String(4000))
603
+ # Deprecated since 0.19.22, not used
599
604
  health_status: Mapped[Optional[str]] = mapped_column(String(4000))
605
+ health: Mapped[HealthStatus] = mapped_column(
606
+ EnumAsString(HealthStatus, 100), default=HealthStatus.HEALTHY
607
+ )
600
608
  first_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
601
609
  last_termination_retry_at: Mapped[Optional[datetime]] = mapped_column(NaiveDateTime)
602
610
 
@@ -627,6 +635,21 @@ class InstanceModel(BaseModel):
627
635
  )
628
636
 
629
637
 
638
+ class InstanceHealthCheckModel(BaseModel):
639
+ __tablename__ = "instance_health_checks"
640
+
641
+ id: Mapped[uuid.UUID] = mapped_column(
642
+ UUIDType(binary=False), primary_key=True, default=uuid.uuid4
643
+ )
644
+
645
+ instance_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("instances.id"))
646
+ instance: Mapped["InstanceModel"] = relationship()
647
+
648
+ collected_at: Mapped[datetime] = mapped_column(NaiveDateTime)
649
+ status: Mapped[HealthStatus] = mapped_column(EnumAsString(HealthStatus, 100))
650
+ response: Mapped[str] = mapped_column(Text)
651
+
652
+
630
653
  class VolumeModel(BaseModel):
631
654
  __tablename__ = "volumes"
632
655
 
@@ -729,6 +752,24 @@ class JobPrometheusMetrics(BaseModel):
729
752
  text: Mapped[str] = mapped_column(Text)
730
753
 
731
754
 
755
+ class ProbeModel(BaseModel):
756
+ __tablename__ = "probes"
757
+ __table_args__ = (UniqueConstraint("job_id", "probe_num", name="uq_probes_job_id_probe_num"),)
758
+
759
+ id: Mapped[uuid.UUID] = mapped_column(
760
+ UUIDType(binary=False), primary_key=True, default=uuid.uuid4
761
+ )
762
+ name: Mapped[str] = mapped_column(String(100))
763
+
764
+ job_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("jobs.id"), primary_key=True)
765
+ job: Mapped["JobModel"] = relationship(back_populates="probes")
766
+
767
+ probe_num: Mapped[int] = mapped_column(Integer) # index in JobSpec.probes
768
+ due: Mapped[datetime] = mapped_column(NaiveDateTime)
769
+ success_streak: Mapped[int] = mapped_column(BigInteger)
770
+ active: Mapped[bool] = mapped_column(Boolean)
771
+
772
+
732
773
  class SecretModel(BaseModel):
733
774
  __tablename__ = "secrets"
734
775
  __table_args__ = (UniqueConstraint("project_id", "name", name="uq_secrets_project_id_name"),)
@@ -3,12 +3,16 @@ from typing import List
3
3
  from fastapi import APIRouter, Depends
4
4
  from sqlalchemy.ext.asyncio import AsyncSession
5
5
 
6
- import dstack._internal.server.services.instances as instances
6
+ import dstack._internal.server.services.instances as instances_services
7
7
  from dstack._internal.core.models.instances import Instance
8
8
  from dstack._internal.server.db import get_session
9
- from dstack._internal.server.models import UserModel
10
- from dstack._internal.server.schemas.instances import ListInstancesRequest
11
- from dstack._internal.server.security.permissions import Authenticated
9
+ from dstack._internal.server.models import ProjectModel, UserModel
10
+ from dstack._internal.server.schemas.instances import (
11
+ GetInstanceHealthChecksRequest,
12
+ GetInstanceHealthChecksResponse,
13
+ ListInstancesRequest,
14
+ )
15
+ from dstack._internal.server.security.permissions import Authenticated, ProjectMember
12
16
  from dstack._internal.server.utils.routers import (
13
17
  CustomORJSONResponse,
14
18
  get_base_api_additional_responses,
@@ -19,6 +23,11 @@ root_router = APIRouter(
19
23
  tags=["instances"],
20
24
  responses=get_base_api_additional_responses(),
21
25
  )
26
+ project_router = APIRouter(
27
+ prefix="/api/project/{project_name}/instances",
28
+ tags=["instances"],
29
+ responses=get_base_api_additional_responses(),
30
+ )
22
31
 
23
32
 
24
33
  @root_router.post("/list", response_model=List[Instance])
@@ -35,7 +44,7 @@ async def list_instances(
35
44
  the last instance from the previous page as `prev_created_at` and `prev_id`.
36
45
  """
37
46
  return CustomORJSONResponse(
38
- await instances.list_user_instances(
47
+ await instances_services.list_user_instances(
39
48
  session=session,
40
49
  user=user,
41
50
  project_names=body.project_names,
@@ -47,3 +56,22 @@ async def list_instances(
47
56
  ascending=body.ascending,
48
57
  )
49
58
  )
59
+
60
+
61
+ @project_router.post("/get_instance_health_checks", response_model=GetInstanceHealthChecksResponse)
62
+ async def get_instance_health_checks(
63
+ body: GetInstanceHealthChecksRequest,
64
+ session: AsyncSession = Depends(get_session),
65
+ user_project: tuple[UserModel, ProjectModel] = Depends(ProjectMember()),
66
+ ):
67
+ _, project = user_project
68
+ health_checks = await instances_services.get_instance_health_checks(
69
+ session=session,
70
+ project=project,
71
+ fleet_name=body.fleet_name,
72
+ instance_num=body.instance_num,
73
+ after=body.after,
74
+ before=body.before,
75
+ limit=body.limit,
76
+ )
77
+ return CustomORJSONResponse(GetInstanceHealthChecksResponse(health_checks=health_checks))
@@ -0,0 +1,56 @@
1
+ from enum import IntEnum
2
+
3
+ from dstack._internal.core.models.common import CoreModel
4
+ from dstack._internal.core.models.health import HealthStatus
5
+
6
+
7
+ class DCGMHealthResult(IntEnum):
8
+ """
9
+ `dcgmHealthWatchResult_enum`
10
+
11
+ See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/const.go#L1020-L1026
12
+ """
13
+
14
+ DCGM_HEALTH_RESULT_PASS = 0
15
+ DCGM_HEALTH_RESULT_WARN = 10
16
+ DCGM_HEALTH_RESULT_FAIL = 20
17
+
18
+ def to_health_status(self) -> HealthStatus:
19
+ if self == self.DCGM_HEALTH_RESULT_PASS:
20
+ return HealthStatus.HEALTHY
21
+ if self == self.DCGM_HEALTH_RESULT_WARN:
22
+ return HealthStatus.WARNING
23
+ if self == self.DCGM_HEALTH_RESULT_FAIL:
24
+ return HealthStatus.FAILURE
25
+ raise AssertionError("should not reach here")
26
+
27
+
28
+ class DCGMHealthIncident(CoreModel):
29
+ """
30
+ Flattened `dcgmIncidentInfo_t`
31
+
32
+ See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/health.go#L68-L73
33
+ """
34
+
35
+ # dcgmIncidentInfo_t
36
+ system: int
37
+ health: DCGMHealthResult
38
+
39
+ # dcgmDiagErrorDetail_t
40
+ error_message: str
41
+ error_code: int
42
+
43
+ # dcgmGroupEntityPair_t
44
+ entity_group_id: int # dcgmGroupEntityPair_t
45
+ entity_id: int
46
+
47
+
48
+ class DCGMHealthResponse(CoreModel):
49
+ """
50
+ `dcgmHealthResponse_v5`
51
+
52
+ See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/health.go#L75-L78
53
+ """
54
+
55
+ overall_health: DCGMHealthResult
56
+ incidents: list[DCGMHealthIncident]
@@ -3,6 +3,8 @@ from typing import Optional
3
3
  from uuid import UUID
4
4
 
5
5
  from dstack._internal.core.models.common import CoreModel
6
+ from dstack._internal.core.models.health import HealthCheck, HealthStatus
7
+ from dstack._internal.server.schemas.runner import InstanceHealthResponse
6
8
 
7
9
 
8
10
  class ListInstancesRequest(CoreModel):
@@ -13,3 +15,33 @@ class ListInstancesRequest(CoreModel):
13
15
  prev_id: Optional[UUID] = None
14
16
  limit: int = 1000
15
17
  ascending: bool = False
18
+
19
+
20
+ class InstanceCheck(CoreModel):
21
+ reachable: bool
22
+ message: Optional[str] = None
23
+ health_response: Optional[InstanceHealthResponse] = None
24
+
25
+ def get_health_status(self) -> HealthStatus:
26
+ if self.health_response is None:
27
+ return HealthStatus.HEALTHY
28
+ if self.health_response.dcgm is None:
29
+ return HealthStatus.HEALTHY
30
+ return self.health_response.dcgm.overall_health.to_health_status()
31
+
32
+ def has_health_checks(self) -> bool:
33
+ if self.health_response is None:
34
+ return False
35
+ return self.health_response.dcgm is not None
36
+
37
+
38
+ class GetInstanceHealthChecksRequest(CoreModel):
39
+ fleet_name: str
40
+ instance_num: int
41
+ after: Optional[datetime] = None
42
+ before: Optional[datetime] = None
43
+ limit: Optional[int] = None
44
+
45
+
46
+ class GetInstanceHealthChecksResponse(CoreModel):
47
+ health_checks: list[HealthCheck]
@@ -16,6 +16,7 @@ from dstack._internal.core.models.runs import (
16
16
  RunSpec,
17
17
  )
18
18
  from dstack._internal.core.models.volumes import InstanceMountPoint, VolumeMountPoint
19
+ from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse
19
20
 
20
21
 
21
22
  class JobStateEvent(CoreModel):
@@ -114,6 +115,10 @@ class HealthcheckResponse(CoreModel):
114
115
  version: str
115
116
 
116
117
 
118
+ class InstanceHealthResponse(CoreModel):
119
+ dcgm: Optional[DCGMHealthResponse] = None
120
+
121
+
117
122
  class GPUMetrics(CoreModel):
118
123
  gpu_memory_usage_bytes: int
119
124
  gpu_util_percent: int
@@ -1,3 +1,4 @@
1
+ import operator
1
2
  import uuid
2
3
  from collections.abc import Container, Iterable
3
4
  from datetime import datetime
@@ -6,15 +7,17 @@ from typing import Dict, List, Literal, Optional, Union
6
7
  import gpuhunt
7
8
  from sqlalchemy import and_, or_, select
8
9
  from sqlalchemy.ext.asyncio import AsyncSession
9
- from sqlalchemy.orm import joinedload
10
+ from sqlalchemy.orm import joinedload, load_only
10
11
 
11
12
  from dstack._internal.core.backends.base.offers import (
12
13
  offer_to_catalog_item,
13
14
  requirements_to_query_filter,
14
15
  )
15
16
  from dstack._internal.core.backends.features import BACKENDS_WITH_MULTINODE_SUPPORT
17
+ from dstack._internal.core.errors import ResourceNotExistsError
16
18
  from dstack._internal.core.models.backends.base import BackendType
17
19
  from dstack._internal.core.models.envs import Env
20
+ from dstack._internal.core.models.health import HealthCheck, HealthEvent, HealthStatus
18
21
  from dstack._internal.core.models.instances import (
19
22
  Instance,
20
23
  InstanceAvailability,
@@ -38,10 +41,13 @@ from dstack._internal.core.models.volumes import Volume
38
41
  from dstack._internal.core.services.profiles import get_termination
39
42
  from dstack._internal.server.models import (
40
43
  FleetModel,
44
+ InstanceHealthCheckModel,
41
45
  InstanceModel,
42
46
  ProjectModel,
43
47
  UserModel,
44
48
  )
49
+ from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse
50
+ from dstack._internal.server.schemas.runner import InstanceHealthResponse
45
51
  from dstack._internal.server.services.offers import generate_shared_offer
46
52
  from dstack._internal.server.services.projects import list_user_project_models
47
53
  from dstack._internal.utils import common as common_utils
@@ -50,6 +56,57 @@ from dstack._internal.utils.logging import get_logger
50
56
  logger = get_logger(__name__)
51
57
 
52
58
 
59
+ async def get_instance_health_checks(
60
+ session: AsyncSession,
61
+ project: ProjectModel,
62
+ fleet_name: str,
63
+ instance_num: int,
64
+ after: Optional[datetime] = None,
65
+ before: Optional[datetime] = None,
66
+ limit: Optional[int] = None,
67
+ ) -> list[HealthCheck]:
68
+ """
69
+ Returns instance health checks ordered from the latest to the earliest.
70
+
71
+ Expected usage:
72
+ * limit=100 — get the latest 100 checks
73
+ * after=<now - 1 hour> — get checks for the last hour
74
+ * before=<earliest timestamp from the last batch>, limit=100 — paginate back in history
75
+ """
76
+ res = await session.execute(
77
+ select(InstanceModel)
78
+ .join(FleetModel)
79
+ .where(
80
+ ~InstanceModel.deleted,
81
+ InstanceModel.project_id == project.id,
82
+ InstanceModel.instance_num == instance_num,
83
+ FleetModel.name == fleet_name,
84
+ )
85
+ .options(load_only(InstanceModel.id))
86
+ )
87
+ instance = res.scalar_one_or_none()
88
+ if instance is None:
89
+ raise ResourceNotExistsError()
90
+
91
+ stmt = (
92
+ select(InstanceHealthCheckModel)
93
+ .where(InstanceHealthCheckModel.instance_id == instance.id)
94
+ .order_by(InstanceHealthCheckModel.collected_at.desc())
95
+ )
96
+ if after is not None:
97
+ stmt = stmt.where(InstanceHealthCheckModel.collected_at > after)
98
+ if before is not None:
99
+ stmt = stmt.where(InstanceHealthCheckModel.collected_at < before)
100
+ if limit is not None:
101
+ stmt = stmt.limit(limit)
102
+ health_checks: list[HealthCheck] = []
103
+ res = await session.execute(stmt)
104
+ for health_check_model in res.scalars():
105
+ health_check = instance_health_check_model_to_health_check(health_check_model)
106
+ health_checks.append(health_check)
107
+ return health_checks
108
+
109
+
53
110
  def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
54
111
  instance = Instance(
55
112
  id=instance_model.id,
@@ -60,6 +117,7 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
60
117
  instance_num=instance_model.instance_num,
61
118
  status=instance_model.status,
62
119
  unreachable=instance_model.unreachable,
120
+ health_status=instance_model.health,
63
121
  termination_reason=instance_model.termination_reason,
64
122
  created=instance_model.created_at,
65
123
  total_blocks=instance_model.total_blocks,
@@ -81,6 +139,48 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
81
139
  return instance
82
140
 
83
141
 
142
+ def instance_health_check_model_to_health_check(model: InstanceHealthCheckModel) -> HealthCheck:
143
+ collected_at = model.collected_at
144
+ status = HealthStatus.HEALTHY
145
+ events: list[HealthEvent] = []
146
+ instance_health_response = get_instance_health_response(model)
147
+ if (dcgm := instance_health_response.dcgm) is not None:
148
+ dcgm_health_check = dcgm_health_response_to_health_check(dcgm, collected_at)
149
+ status = dcgm_health_check.status
150
+ events.extend(dcgm_health_check.events)
151
+ events.sort(key=operator.attrgetter("timestamp"), reverse=True)
152
+ return HealthCheck(
153
+ collected_at=collected_at,
154
+ status=status,
155
+ events=events,
156
+ )
157
+
158
+
159
+ def dcgm_health_response_to_health_check(
160
+ response: DCGMHealthResponse, collected_at: datetime
161
+ ) -> HealthCheck:
162
+ events: list[HealthEvent] = []
163
+ for incident in response.incidents:
164
+ events.append(
165
+ HealthEvent(
166
+ timestamp=collected_at,
167
+ status=incident.health.to_health_status(),
168
+ message=incident.error_message,
169
+ )
170
+ )
171
+ return HealthCheck(
172
+ collected_at=collected_at,
173
+ status=response.overall_health.to_health_status(),
174
+ events=events,
175
+ )
176
+
177
+
178
+ def get_instance_health_response(
179
+ instance_health_check_model: InstanceHealthCheckModel,
180
+ ) -> InstanceHealthResponse:
181
+ return InstanceHealthResponse.__response__.parse_raw(instance_health_check_model.response)
182
+
183
+
84
184
  def get_instance_provisioning_data(instance_model: InstanceModel) -> Optional[JobProvisioningData]:
85
185
  if instance_model.job_provisioning_data is None:
86
186
  return None
@@ -194,6 +294,8 @@ def filter_pool_instances(
194
294
  continue
195
295
  if instance.unreachable:
196
296
  continue
297
+ if instance.health.is_failure():
298
+ continue
197
299
  fleet = instance.fleet
198
300
  if profile.fleets is not None and (fleet is None or fleet.name not in profile.fleets):
199
301
  continue
@@ -52,6 +52,7 @@ from dstack._internal.server.services.jobs.configurators.dev import DevEnvironme
52
52
  from dstack._internal.server.services.jobs.configurators.service import ServiceJobConfigurator
53
53
  from dstack._internal.server.services.jobs.configurators.task import TaskJobConfigurator
54
54
  from dstack._internal.server.services.logging import fmt
55
+ from dstack._internal.server.services.probes import probe_model_to_probe
55
56
  from dstack._internal.server.services.runner import client
56
57
  from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
57
58
  from dstack._internal.server.services.volumes import (
@@ -115,7 +116,9 @@ async def get_run_job_model(
115
116
  return res.scalar_one_or_none()
116
117
 
117
118
 
118
- def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
119
+ def job_model_to_job_submission(
120
+ job_model: JobModel, include_probes: bool = False
121
+ ) -> JobSubmission:
119
122
  job_provisioning_data = get_job_provisioning_data(job_model)
120
123
  if job_provisioning_data is not None:
121
124
  # TODO remove after transitioning to computed fields
@@ -136,6 +139,9 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
136
139
  finished_at = last_processed_at
137
140
  status_message = _get_job_status_message(job_model)
138
141
  error = _get_job_error(job_model)
142
+ probes = []
143
+ if include_probes:
144
+ probes = [probe_model_to_probe(pm) for pm in job_model.probes]
139
145
  return JobSubmission(
140
146
  id=job_model.id,
141
147
  submission_num=job_model.submission_num,
@@ -152,6 +158,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
152
158
  job_provisioning_data=job_provisioning_data,
153
159
  job_runtime_data=get_job_runtime_data(job_model),
154
160
  error=error,
161
+ probes=probes,
155
162
  )
156
163
 
157
164
 
@@ -11,8 +11,14 @@ from dstack._internal import settings
11
11
  from dstack._internal.core.errors import DockerRegistryError, ServerClientError
12
12
  from dstack._internal.core.models.common import RegistryAuth
13
13
  from dstack._internal.core.models.configurations import (
14
+ DEFAULT_PROBE_INTERVAL,
15
+ DEFAULT_PROBE_METHOD,
16
+ DEFAULT_PROBE_READY_AFTER,
17
+ DEFAULT_PROBE_TIMEOUT,
18
+ DEFAULT_PROBE_URL,
14
19
  DEFAULT_REPO_DIR,
15
20
  PortMapping,
21
+ ProbeConfig,
16
22
  PythonVersion,
17
23
  RunConfigurationType,
18
24
  ServiceConfiguration,
@@ -26,6 +32,7 @@ from dstack._internal.core.models.runs import (
26
32
  AppSpec,
27
33
  JobSpec,
28
34
  JobSSHKey,
35
+ ProbeSpec,
29
36
  Requirements,
30
37
  Retry,
31
38
  RunSpec,
@@ -155,6 +162,7 @@ class JobConfigurator(ABC):
155
162
  repo_code_hash=self.run_spec.repo_code_hash,
156
163
  file_archives=self.run_spec.file_archives,
157
164
  service_port=self._service_port(),
165
+ probes=self._probes(),
158
166
  )
159
167
  return job_spec
160
168
 
@@ -313,6 +321,11 @@ class JobConfigurator(ABC):
313
321
  return self.run_spec.configuration.port.container_port
314
322
  return None
315
323
 
324
+ def _probes(self) -> list[ProbeSpec]:
325
+ if isinstance(self.run_spec.configuration, ServiceConfiguration):
326
+ return list(map(_probe_config_to_spec, self.run_spec.configuration.probes))
327
+ return []
328
+
316
329
 
317
330
  def interpolate_job_volumes(
318
331
  run_volumes: List[Union[MountPoint, str]],
@@ -353,6 +366,19 @@ def interpolate_job_volumes(
353
366
  return job_volumes
354
367
 
355
368
 
369
+ def _probe_config_to_spec(c: ProbeConfig) -> ProbeSpec:
370
+ return ProbeSpec(
371
+ type=c.type,
372
+ url=c.url if c.url is not None else DEFAULT_PROBE_URL,
373
+ timeout=c.timeout if c.timeout is not None else DEFAULT_PROBE_TIMEOUT,
374
+ interval=c.interval if c.interval is not None else DEFAULT_PROBE_INTERVAL,
375
+ ready_after=c.ready_after if c.ready_after is not None else DEFAULT_PROBE_READY_AFTER,
376
+ method=c.method if c.method is not None else DEFAULT_PROBE_METHOD,
377
+ headers=c.headers,
378
+ body=c.body,
379
+ )
380
+
381
+
356
382
  def _join_shell_commands(commands: List[str]) -> str:
357
383
  for i, cmd in enumerate(commands):
358
384
  cmd = cmd.strip()
@@ -1,9 +1,9 @@
1
1
  from typing import Union
2
2
 
3
- from dstack._internal.server.models import GatewayModel, JobModel, RunModel
3
+ from dstack._internal.server.models import GatewayModel, JobModel, ProbeModel, RunModel
4
4
 
5
5
 
6
- def fmt(model: Union[RunModel, JobModel, GatewayModel]) -> str:
6
+ def fmt(model: Union[RunModel, JobModel, GatewayModel, ProbeModel]) -> str:
7
7
  """Consistent string representation of a model for logging."""
8
8
  if isinstance(model, RunModel):
9
9
  return f"run({model.id.hex[:6]}){model.run_name}"
@@ -11,4 +11,6 @@ def fmt(model: Union[RunModel, JobModel, GatewayModel]) -> str:
11
11
  return f"job({model.id.hex[:6]}){model.job_name}"
12
12
  if isinstance(model, GatewayModel):
13
13
  return f"gateway({model.id.hex[:6]}){model.name}"
14
+ if isinstance(model, ProbeModel):
15
+ return f"probe({model.id.hex[:6]}){model.name}"
14
16
  return str(model)
@@ -1,5 +1,7 @@
1
1
  import itertools
2
2
  import operator
3
+ import urllib
4
+ import urllib.parse
3
5
  from contextlib import contextmanager
4
6
  from datetime import datetime, timedelta, timezone
5
7
  from typing import Iterator, List, Optional, Set, Tuple, TypedDict
@@ -64,6 +66,7 @@ class CloudWatchLogStorage(LogStorage):
64
66
  self._client = session.client("logs")
65
67
  self._check_group_exists(group)
66
68
  self._group = group
69
+ self._region = self._client.meta.region_name
67
70
  # Stores names of already created streams.
68
71
  # XXX: This set acts as an unbound cache. If this becomes a problem (in case of _very_ long
69
72
  # running server and/or lots of jobs, consider replacing it with an LRU cache, e.g.,
@@ -103,7 +106,11 @@ class CloudWatchLogStorage(LogStorage):
103
106
  )
104
107
  for cw_event in cw_events
105
108
  ]
106
- return JobSubmissionLogs(logs=logs, next_token=next_token)
109
+ return JobSubmissionLogs(
110
+ logs=logs,
111
+ external_url=self._get_stream_external_url(stream),
112
+ next_token=next_token,
113
+ )
107
114
 
108
115
  def _get_log_events_with_retry(
109
116
  self, stream: str, request: PollLogsRequest
@@ -181,6 +188,11 @@ class CloudWatchLogStorage(LogStorage):
181
188
 
182
189
  return events, next_token
183
190
 
191
+ def _get_stream_external_url(self, stream: str) -> str:
192
+ quoted_group = urllib.parse.quote(self._group, safe="")
193
+ quoted_stream = urllib.parse.quote(stream, safe="")
194
+ return f"https://console.aws.amazon.com/cloudwatch/home?region={self._region}#logsV2:log-groups/log-group/{quoted_group}/log-events/{quoted_stream}"
195
+
184
196
  def write_logs(
185
197
  self,
186
198
  project: ProjectModel,
@@ -1,3 +1,4 @@
1
+ import urllib.parse
1
2
  from typing import List
2
3
  from uuid import UUID
3
4
 
@@ -48,6 +49,7 @@ class GCPLogStorage(LogStorage):
48
49
  # (https://cloud.google.com/logging/docs/analyze/custom-index).
49
50
 
50
51
  def __init__(self, project_id: str):
52
+ self.project_id = project_id
51
53
  try:
52
54
  self.client = logging_v2.Client(project=project_id)
53
55
  self.logger = self.client.logger(name=self.LOG_NAME)
@@ -106,7 +108,11 @@ class GCPLogStorage(LogStorage):
106
108
  "GCP Logging read request limit exceeded."
107
109
  " It's recommended to increase default entries.list request quota from 60 per minute."
108
110
  )
109
- return JobSubmissionLogs(logs=logs, next_token=next_token if len(logs) > 0 else None)
111
+ return JobSubmissionLogs(
112
+ logs=logs,
113
+ external_url=self._get_stream_extrnal_url(stream_name),
114
+ next_token=next_token if len(logs) > 0 else None,
115
+ )
110
116
 
111
117
  def write_logs(
112
118
  self,
@@ -162,3 +168,12 @@ class GCPLogStorage(LogStorage):
162
168
  self, project_name: str, run_name: str, job_submission_id: UUID, producer: LogProducer
163
169
  ) -> str:
164
170
  return f"{project_name}-{run_name}-{job_submission_id}-{producer.value}"
171
+
172
+ def _get_stream_extrnal_url(self, stream_name: str) -> str:
173
+ log_name_resource_name = self._get_log_name_resource_name()
174
+ query = f'logName="{log_name_resource_name}" AND labels.stream="{stream_name}"'
175
+ quoted_query = urllib.parse.quote(query, safe="")
176
+ return f"https://console.cloud.google.com/logs/query;query={quoted_query}?project={self.project_id}"
177
+
178
+ def _get_log_name_resource_name(self) -> str:
179
+ return f"projects/{self.project_id}/logs/{self.LOG_NAME}"
@@ -0,0 +1,6 @@
1
+ from dstack._internal.core.models.runs import Probe
2
+ from dstack._internal.server.models import ProbeModel
3
+
4
+
5
+ def probe_model_to_probe(probe_model: ProbeModel) -> Probe:
6
+ return Probe(success_streak=probe_model.success_streak)
@@ -197,6 +197,10 @@ async def set_project_members(
197
197
  project: ProjectModel,
198
198
  members: List[MemberSetting],
199
199
  ):
200
+ usernames = {m.username for m in members}
201
+ if len(usernames) != len(members):
202
+ raise ServerClientError("Cannot add same user multiple times")
203
+
200
204
  project = await get_project_model_by_name_or_error(
201
205
  session=session,
202
206
  project_name=project.name,
@@ -245,6 +249,10 @@ async def add_project_members(
245
249
  members: List[MemberSetting],
246
250
  ):
247
251
  """Add multiple members to a project."""
252
+ usernames = {m.username for m in members}
253
+ if len(usernames) != len(members):
254
+ raise ServerClientError("Cannot add same user multiple times")
255
+
248
256
  project = await get_project_model_by_name_or_error(
249
257
  session=session,
250
258
  project_name=project.name,
@@ -259,7 +267,10 @@ async def add_project_members(
259
267
  )
260
268
 
261
269
  if not is_self_join_to_public:
262
- if requesting_user_role not in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
270
+ if user.global_role != GlobalRole.ADMIN and requesting_user_role not in [
271
+ ProjectRole.ADMIN,
272
+ ProjectRole.MANAGER,
273
+ ]:
263
274
  raise ForbiddenError("Access denied: insufficient permissions to add members")
264
275
 
265
276
  if user.global_role != GlobalRole.ADMIN and requesting_user_role == ProjectRole.MANAGER:
@@ -272,8 +283,6 @@ async def add_project_members(
272
283
  if members[0].project_role != ProjectRole.USER:
273
284
  raise ForbiddenError("Access denied: can only join public projects as user role")
274
285
 
275
- usernames = [member.username for member in members]
276
-
277
286
  res = await session.execute(
278
287
  select(UserModel).where((UserModel.name.in_(usernames)) | (UserModel.email.in_(usernames)))
279
288
  )
@@ -628,7 +637,10 @@ async def remove_project_members(
628
637
  )
629
638
 
630
639
  if not is_self_leave:
631
- if requesting_user_role not in [ProjectRole.ADMIN, ProjectRole.MANAGER]:
640
+ if user.global_role != GlobalRole.ADMIN and requesting_user_role not in [
641
+ ProjectRole.ADMIN,
642
+ ProjectRole.MANAGER,
643
+ ]:
632
644
  raise ForbiddenError("Access denied: insufficient permissions to remove members")
633
645
 
634
646
  res = await session.execute(