dstack 0.19.20__py3-none-any.whl → 0.19.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (93) hide show
  1. dstack/_internal/cli/commands/apply.py +8 -3
  2. dstack/_internal/cli/services/configurators/__init__.py +8 -0
  3. dstack/_internal/cli/services/configurators/fleet.py +1 -1
  4. dstack/_internal/cli/services/configurators/gateway.py +1 -1
  5. dstack/_internal/cli/services/configurators/run.py +11 -1
  6. dstack/_internal/cli/services/configurators/volume.py +1 -1
  7. dstack/_internal/cli/utils/common.py +48 -5
  8. dstack/_internal/cli/utils/fleet.py +5 -5
  9. dstack/_internal/cli/utils/run.py +32 -0
  10. dstack/_internal/core/backends/__init__.py +0 -65
  11. dstack/_internal/core/backends/configurators.py +9 -0
  12. dstack/_internal/core/backends/features.py +64 -0
  13. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  14. dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
  15. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  16. dstack/_internal/core/backends/hotaisle/compute.py +225 -0
  17. dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
  18. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  19. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  20. dstack/_internal/core/backends/models.py +8 -0
  21. dstack/_internal/core/compatibility/fleets.py +2 -0
  22. dstack/_internal/core/compatibility/runs.py +12 -0
  23. dstack/_internal/core/models/backends/base.py +2 -0
  24. dstack/_internal/core/models/configurations.py +139 -1
  25. dstack/_internal/core/models/health.py +28 -0
  26. dstack/_internal/core/models/instances.py +2 -0
  27. dstack/_internal/core/models/logs.py +2 -1
  28. dstack/_internal/core/models/profiles.py +37 -0
  29. dstack/_internal/core/models/runs.py +21 -1
  30. dstack/_internal/core/services/ssh/tunnel.py +7 -0
  31. dstack/_internal/server/app.py +26 -10
  32. dstack/_internal/server/background/__init__.py +9 -6
  33. dstack/_internal/server/background/tasks/process_fleets.py +52 -38
  34. dstack/_internal/server/background/tasks/process_gateways.py +2 -2
  35. dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
  36. dstack/_internal/server/background/tasks/process_instances.py +168 -103
  37. dstack/_internal/server/background/tasks/process_metrics.py +9 -2
  38. dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
  39. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  40. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
  41. dstack/_internal/server/background/tasks/process_running_jobs.py +142 -124
  42. dstack/_internal/server/background/tasks/process_runs.py +84 -34
  43. dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
  44. dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
  45. dstack/_internal/server/background/tasks/process_volumes.py +4 -1
  46. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  47. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  48. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  49. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  50. dstack/_internal/server/models.py +57 -16
  51. dstack/_internal/server/routers/instances.py +33 -5
  52. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  53. dstack/_internal/server/schemas/instances.py +32 -0
  54. dstack/_internal/server/schemas/runner.py +5 -0
  55. dstack/_internal/server/services/fleets.py +19 -10
  56. dstack/_internal/server/services/gateways/__init__.py +17 -17
  57. dstack/_internal/server/services/instances.py +113 -15
  58. dstack/_internal/server/services/jobs/__init__.py +18 -13
  59. dstack/_internal/server/services/jobs/configurators/base.py +26 -0
  60. dstack/_internal/server/services/logging.py +4 -2
  61. dstack/_internal/server/services/logs/aws.py +13 -1
  62. dstack/_internal/server/services/logs/gcp.py +16 -1
  63. dstack/_internal/server/services/offers.py +3 -3
  64. dstack/_internal/server/services/probes.py +6 -0
  65. dstack/_internal/server/services/projects.py +51 -19
  66. dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
  67. dstack/_internal/server/services/prometheus/custom_metrics.py +2 -3
  68. dstack/_internal/server/services/runner/client.py +52 -20
  69. dstack/_internal/server/services/runner/ssh.py +4 -4
  70. dstack/_internal/server/services/runs.py +115 -39
  71. dstack/_internal/server/services/services/__init__.py +4 -1
  72. dstack/_internal/server/services/ssh.py +66 -0
  73. dstack/_internal/server/services/users.py +2 -3
  74. dstack/_internal/server/services/volumes.py +11 -11
  75. dstack/_internal/server/settings.py +16 -0
  76. dstack/_internal/server/statics/index.html +1 -1
  77. dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
  78. dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
  79. dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
  80. dstack/_internal/server/testing/common.py +51 -0
  81. dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
  82. dstack/_internal/server/utils/sentry_utils.py +12 -0
  83. dstack/_internal/settings.py +3 -0
  84. dstack/_internal/utils/common.py +15 -0
  85. dstack/_internal/utils/cron.py +5 -0
  86. dstack/api/server/__init__.py +1 -1
  87. dstack/version.py +1 -1
  88. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/METADATA +13 -22
  89. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/RECORD +93 -75
  90. /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
  91. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/WHEEL +0 -0
  92. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/entry_points.txt +0 -0
  93. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,56 @@
1
+ from enum import IntEnum
2
+
3
+ from dstack._internal.core.models.common import CoreModel
4
+ from dstack._internal.core.models.health import HealthStatus
5
+
6
+
7
class DCGMHealthResult(IntEnum):
    """
    Python counterpart of DCGM's `dcgmHealthWatchResult_enum`.

    See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/const.go#L1020-L1026
    """

    DCGM_HEALTH_RESULT_PASS = 0
    DCGM_HEALTH_RESULT_WARN = 10
    DCGM_HEALTH_RESULT_FAIL = 20

    def to_health_status(self) -> HealthStatus:
        """
        Translate this DCGM result into the matching `HealthStatus`.

        PASS maps to HEALTHY, WARN to WARNING and FAIL to FAILURE.

        Raises:
            AssertionError: if the value is not one of the three known results.
        """
        # The lookup table is local to the method on purpose: it is only
        # needed here and must not become part of the enum's namespace.
        status_by_result = {
            DCGMHealthResult.DCGM_HEALTH_RESULT_PASS: HealthStatus.HEALTHY,
            DCGMHealthResult.DCGM_HEALTH_RESULT_WARN: HealthStatus.WARNING,
            DCGMHealthResult.DCGM_HEALTH_RESULT_FAIL: HealthStatus.FAILURE,
        }
        health_status = status_by_result.get(self)
        if health_status is None:
            raise AssertionError("should not reach here")
        return health_status
26
+
27
+
28
class DCGMHealthIncident(CoreModel):
    """
    One DCGM health incident: `dcgmIncidentInfo_t` flattened into a single model.

    The fields are grouped below by the DCGM struct they originate from.

    See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/health.go#L68-L73
    """

    # From dcgmIncidentInfo_t itself
    system: int
    health: DCGMHealthResult

    # From dcgmDiagErrorDetail_t
    error_message: str
    error_code: int

    # From dcgmGroupEntityPair_t
    entity_group_id: int
    entity_id: int
46
+
47
+
48
class DCGMHealthResponse(CoreModel):
    """
    Model of DCGM's `dcgmHealthResponse_v5`.

    See: https://github.com/NVIDIA/go-dcgm/blob/85ceb31/pkg/dcgm/health.go#L75-L78
    """

    # Aggregated health result reported by DCGM
    overall_health: DCGMHealthResult
    # Individual incidents accompanying the overall result
    incidents: list[DCGMHealthIncident]
@@ -3,6 +3,8 @@ from typing import Optional
3
3
  from uuid import UUID
4
4
 
5
5
  from dstack._internal.core.models.common import CoreModel
6
+ from dstack._internal.core.models.health import HealthCheck, HealthStatus
7
+ from dstack._internal.server.schemas.runner import InstanceHealthResponse
6
8
 
7
9
 
8
10
  class ListInstancesRequest(CoreModel):
@@ -13,3 +15,33 @@ class ListInstancesRequest(CoreModel):
13
15
  prev_id: Optional[UUID] = None
14
16
  limit: int = 1000
15
17
  ascending: bool = False
18
+
19
+
20
class InstanceCheck(CoreModel):
    """Result of checking an instance: reachability plus an optional health response."""

    reachable: bool
    message: Optional[str] = None
    health_response: Optional[InstanceHealthResponse] = None

    def get_health_status(self) -> HealthStatus:
        """
        Return the health status derived from the DCGM part of the health response.

        `HealthStatus.HEALTHY` is returned when there is no health response
        or the response carries no DCGM data.
        """
        response = self.health_response
        if response is not None and response.dcgm is not None:
            return response.dcgm.overall_health.to_health_status()
        return HealthStatus.HEALTHY

    def has_health_checks(self) -> bool:
        """Tell whether a health response with DCGM data is attached to this check."""
        response = self.health_response
        return response is not None and response.dcgm is not None
36
+
37
+
38
class GetInstanceHealthChecksRequest(CoreModel):
    """Request body for fetching the health checks of a single fleet instance."""

    # The instance is addressed by the name of its fleet and its number
    fleet_name: str
    instance_num: int
    # Optional time window and result cap.
    # NOTE(review): presumably forwarded to `get_instance_health_checks` — confirm in the router.
    after: Optional[datetime] = None
    before: Optional[datetime] = None
    limit: Optional[int] = None
44
+
45
+
46
class GetInstanceHealthChecksResponse(CoreModel):
    """Response body wrapping a list of instance health checks."""

    health_checks: list[HealthCheck]
@@ -16,6 +16,7 @@ from dstack._internal.core.models.runs import (
16
16
  RunSpec,
17
17
  )
18
18
  from dstack._internal.core.models.volumes import InstanceMountPoint, VolumeMountPoint
19
+ from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse
19
20
 
20
21
 
21
22
  class JobStateEvent(CoreModel):
@@ -114,6 +115,10 @@ class HealthcheckResponse(CoreModel):
114
115
  version: str
115
116
 
116
117
 
118
class InstanceHealthResponse(CoreModel):
    """Instance health payload; currently it only carries optional DCGM data."""

    # None when no DCGM health data is included in the response
    dcgm: Optional[DCGMHealthResponse] = None
120
+
121
+
117
122
  class GPUMetrics(CoreModel):
118
123
  gpu_memory_usage_bytes: int
119
124
  gpu_util_percent: int
@@ -1,6 +1,6 @@
1
1
  import uuid
2
2
  from collections.abc import Callable
3
- from datetime import datetime, timezone
3
+ from datetime import datetime
4
4
  from functools import wraps
5
5
  from typing import List, Literal, Optional, Tuple, TypeVar, Union, cast
6
6
 
@@ -8,8 +8,8 @@ from sqlalchemy import and_, func, or_, select
8
8
  from sqlalchemy.ext.asyncio import AsyncSession
9
9
  from sqlalchemy.orm import joinedload, selectinload
10
10
 
11
- from dstack._internal.core.backends import BACKENDS_WITH_CREATE_INSTANCE_SUPPORT
12
11
  from dstack._internal.core.backends.base.backend import Backend
12
+ from dstack._internal.core.backends.features import BACKENDS_WITH_CREATE_INSTANCE_SUPPORT
13
13
  from dstack._internal.core.errors import (
14
14
  ForbiddenError,
15
15
  ResourceExistsError,
@@ -49,6 +49,7 @@ from dstack._internal.server.db import get_db
49
49
  from dstack._internal.server.models import (
50
50
  FleetModel,
51
51
  InstanceModel,
52
+ JobModel,
52
53
  ProjectModel,
53
54
  UserModel,
54
55
  )
@@ -66,7 +67,6 @@ from dstack._internal.server.services.plugins import apply_plugin_policies
66
67
  from dstack._internal.server.services.projects import (
67
68
  get_member,
68
69
  get_member_permissions,
69
- list_project_models,
70
70
  list_user_project_models,
71
71
  )
72
72
  from dstack._internal.server.services.resources import set_resources_defaults
@@ -87,10 +87,11 @@ async def list_fleets(
87
87
  limit: int,
88
88
  ascending: bool,
89
89
  ) -> List[Fleet]:
90
- if user.global_role == GlobalRole.ADMIN:
91
- projects = await list_project_models(session=session)
92
- else:
93
- projects = await list_user_project_models(session=session, user=user)
90
+ projects = await list_user_project_models(
91
+ session=session,
92
+ user=user,
93
+ only_names=True,
94
+ )
94
95
  if project_name is not None:
95
96
  projects = [p for p in projects if p.name == project_name]
96
97
  fleet_models = await list_projects_fleet_models(
@@ -398,7 +399,11 @@ async def apply_plan(
398
399
  FleetModel.id == fleet_model.id,
399
400
  FleetModel.deleted == False,
400
401
  )
401
- .options(selectinload(FleetModel.instances))
402
+ .options(
403
+ selectinload(FleetModel.instances)
404
+ .joinedload(InstanceModel.jobs)
405
+ .load_only(JobModel.id)
406
+ )
402
407
  .options(selectinload(FleetModel.runs))
403
408
  .execution_options(populate_existing=True)
404
409
  .order_by(FleetModel.id) # take locks in order
@@ -563,7 +568,11 @@ async def delete_fleets(
563
568
  FleetModel.name.in_(names),
564
569
  FleetModel.deleted == False,
565
570
  )
566
- .options(selectinload(FleetModel.instances))
571
+ .options(
572
+ selectinload(FleetModel.instances)
573
+ .joinedload(InstanceModel.jobs)
574
+ .load_only(JobModel.id)
575
+ )
567
576
  .options(selectinload(FleetModel.runs))
568
577
  .execution_options(populate_existing=True)
569
578
  .order_by(FleetModel.id) # take locks in order
@@ -600,7 +609,7 @@ def fleet_model_to_fleet(
600
609
  name=fleet_model.name,
601
610
  project_name=fleet_model.project.name,
602
611
  spec=spec,
603
- created_at=fleet_model.created_at.replace(tzinfo=timezone.utc),
612
+ created_at=fleet_model.created_at,
604
613
  status=fleet_model.status,
605
614
  status_message=fleet_model.status_message,
606
615
  instances=instances,
@@ -1,7 +1,7 @@
1
1
  import asyncio
2
2
  import datetime
3
3
  import uuid
4
- from datetime import timedelta, timezone
4
+ from datetime import timedelta
5
5
  from functools import partial
6
6
  from typing import List, Optional, Sequence
7
7
 
@@ -11,16 +11,16 @@ from sqlalchemy.ext.asyncio import AsyncSession
11
11
  from sqlalchemy.orm import selectinload
12
12
 
13
13
  import dstack._internal.utils.random_names as random_names
14
- from dstack._internal.core.backends import (
15
- BACKENDS_WITH_GATEWAY_SUPPORT,
16
- BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT,
17
- )
18
14
  from dstack._internal.core.backends.base.compute import (
19
15
  Compute,
20
16
  ComputeWithGatewaySupport,
21
17
  get_dstack_gateway_wheel,
22
18
  get_dstack_runner_version,
23
19
  )
20
+ from dstack._internal.core.backends.features import (
21
+ BACKENDS_WITH_GATEWAY_SUPPORT,
22
+ BACKENDS_WITH_PRIVATE_GATEWAY_SUPPORT,
23
+ )
24
24
  from dstack._internal.core.errors import (
25
25
  GatewayError,
26
26
  ResourceNotExistsError,
@@ -86,15 +86,6 @@ async def get_gateway_by_name(
86
86
  return gateway_model_to_gateway(gateway)
87
87
 
88
88
 
89
- async def get_project_default_gateway(
90
- session: AsyncSession, project: ProjectModel
91
- ) -> Optional[Gateway]:
92
- gateway: Optional[GatewayModel] = project.default_gateway
93
- if gateway is None:
94
- return None
95
- return gateway_model_to_gateway(gateway)
96
-
97
-
98
89
  async def create_gateway_compute(
99
90
  project_name: str,
100
91
  backend_compute: Compute,
@@ -181,9 +172,9 @@ async def create_gateway(
181
172
  session.add(gateway)
182
173
  await session.commit()
183
174
 
184
- if project.default_gateway is None or configuration.default:
175
+ default_gateway = await get_project_default_gateway_model(session=session, project=project)
176
+ if default_gateway is None or configuration.default:
185
177
  await set_default_gateway(session=session, project=project, name=configuration.name)
186
-
187
178
  return gateway_model_to_gateway(gateway)
188
179
 
189
180
 
@@ -349,6 +340,15 @@ async def get_project_gateway_model_by_name(
349
340
  return res.scalar()
350
341
 
351
342
 
343
async def get_project_default_gateway_model(
    session: AsyncSession, project: ProjectModel
) -> Optional[GatewayModel]:
    """
    Load the gateway model referenced by the project's `default_gateway_id`.

    Args:
        session: database session used to run the query.
        project: project whose default gateway should be loaded.

    Returns:
        The matching `GatewayModel`, or `None` if the project has no default
        gateway set or no gateway with that id exists.
    """
    default_gateway_id = project.default_gateway_id
    if default_gateway_id is None:
        # No default gateway is configured: a NULL id cannot identify a gateway,
        # so answer right away instead of paying for a database round trip.
        return None
    res = await session.execute(
        select(GatewayModel).where(GatewayModel.id == default_gateway_id)
    )
    return res.scalar_one_or_none()
350
+
351
+
352
352
  async def generate_gateway_name(session: AsyncSession, project: ProjectModel) -> str:
353
353
  gateways = await list_project_gateway_models(session=session, project=project)
354
354
  names = {g.name for g in gateways}
@@ -557,7 +557,7 @@ def gateway_model_to_gateway(gateway_model: GatewayModel) -> Gateway:
557
557
  region=gateway_model.region,
558
558
  wildcard_domain=gateway_model.wildcard_domain,
559
559
  default=gateway_model.project.default_gateway_id == gateway_model.id,
560
- created_at=gateway_model.created_at.replace(tzinfo=timezone.utc),
560
+ created_at=gateway_model.created_at,
561
561
  status=gateway_model.status,
562
562
  status_message=gateway_model.status_message,
563
563
  configuration=configuration,
@@ -1,20 +1,23 @@
1
+ import operator
1
2
  import uuid
2
3
  from collections.abc import Container, Iterable
3
- from datetime import datetime, timezone
4
+ from datetime import datetime
4
5
  from typing import Dict, List, Literal, Optional, Union
5
6
 
6
7
  import gpuhunt
7
8
  from sqlalchemy import and_, or_, select
8
9
  from sqlalchemy.ext.asyncio import AsyncSession
9
- from sqlalchemy.orm import joinedload
10
+ from sqlalchemy.orm import joinedload, load_only
10
11
 
11
- from dstack._internal.core.backends import BACKENDS_WITH_MULTINODE_SUPPORT
12
12
  from dstack._internal.core.backends.base.offers import (
13
13
  offer_to_catalog_item,
14
14
  requirements_to_query_filter,
15
15
  )
16
+ from dstack._internal.core.backends.features import BACKENDS_WITH_MULTINODE_SUPPORT
17
+ from dstack._internal.core.errors import ResourceNotExistsError
16
18
  from dstack._internal.core.models.backends.base import BackendType
17
19
  from dstack._internal.core.models.envs import Env
20
+ from dstack._internal.core.models.health import HealthCheck, HealthEvent, HealthStatus
18
21
  from dstack._internal.core.models.instances import (
19
22
  Instance,
20
23
  InstanceAvailability,
@@ -34,23 +37,76 @@ from dstack._internal.core.models.profiles import (
34
37
  TerminationPolicy,
35
38
  )
36
39
  from dstack._internal.core.models.runs import JobProvisioningData, Requirements
37
- from dstack._internal.core.models.users import GlobalRole
38
40
  from dstack._internal.core.models.volumes import Volume
39
41
  from dstack._internal.core.services.profiles import get_termination
40
42
  from dstack._internal.server.models import (
41
43
  FleetModel,
44
+ InstanceHealthCheckModel,
42
45
  InstanceModel,
43
46
  ProjectModel,
44
47
  UserModel,
45
48
  )
49
+ from dstack._internal.server.schemas.health.dcgm import DCGMHealthResponse
50
+ from dstack._internal.server.schemas.runner import InstanceHealthResponse
46
51
  from dstack._internal.server.services.offers import generate_shared_offer
47
- from dstack._internal.server.services.projects import list_project_models, list_user_project_models
52
+ from dstack._internal.server.services.projects import list_user_project_models
48
53
  from dstack._internal.utils import common as common_utils
49
54
  from dstack._internal.utils.logging import get_logger
50
55
 
51
56
  logger = get_logger(__name__)
52
57
 
53
58
 
59
async def get_instance_health_checks(
    session: AsyncSession,
    project: ProjectModel,
    fleet_name: str,
    instance_num: int,
    after: Optional[datetime] = None,
    before: Optional[datetime] = None,
    limit: Optional[int] = None,
) -> list[HealthCheck]:
    """
    Fetch the health checks of a fleet instance, newest first.

    The instance is looked up among non-deleted instances by project,
    fleet name and instance number. Both time bounds are exclusive.

    Typical calls:
      * `limit=100` — the 100 most recent checks
      * `after=<now - 1 hour>` — checks collected during the last hour
      * `before=<earliest timestamp of the previous batch>, limit=100` — page back in history

    Raises:
        ResourceNotExistsError: if no matching instance is found.
    """
    instance_query = (
        select(InstanceModel)
        .join(FleetModel)
        .where(
            ~InstanceModel.deleted,
            InstanceModel.project_id == project.id,
            InstanceModel.instance_num == instance_num,
            FleetModel.name == fleet_name,
        )
        # Only the id is needed to select the health checks below
        .options(load_only(InstanceModel.id))
    )
    instance = (await session.execute(instance_query)).scalar_one_or_none()
    if instance is None:
        raise ResourceNotExistsError()

    filters = [InstanceHealthCheckModel.instance_id == instance.id]
    if after is not None:
        filters.append(InstanceHealthCheckModel.collected_at > after)
    if before is not None:
        filters.append(InstanceHealthCheckModel.collected_at < before)
    checks_query = (
        select(InstanceHealthCheckModel)
        .where(*filters)
        .order_by(InstanceHealthCheckModel.collected_at.desc())
    )
    if limit is not None:
        checks_query = checks_query.limit(limit)

    rows = await session.execute(checks_query)
    return [
        instance_health_check_model_to_health_check(check_model)
        for check_model in rows.scalars()
    ]
108
+
109
+
54
110
  def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
55
111
  instance = Instance(
56
112
  id=instance_model.id,
@@ -61,8 +117,9 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
61
117
  instance_num=instance_model.instance_num,
62
118
  status=instance_model.status,
63
119
  unreachable=instance_model.unreachable,
120
+ health_status=instance_model.health,
64
121
  termination_reason=instance_model.termination_reason,
65
- created=instance_model.created_at.replace(tzinfo=timezone.utc),
122
+ created=instance_model.created_at,
66
123
  total_blocks=instance_model.total_blocks,
67
124
  busy_blocks=instance_model.busy_blocks,
68
125
  )
@@ -82,6 +139,48 @@ def instance_model_to_instance(instance_model: InstanceModel) -> Instance:
82
139
  return instance
83
140
 
84
141
 
142
def instance_health_check_model_to_health_check(model: InstanceHealthCheckModel) -> HealthCheck:
    """
    Convert a stored health check row into a `HealthCheck`.

    Without DCGM data in the stored response the check is reported as
    `HealthStatus.HEALTHY` with no events. Otherwise the status and events
    come from the DCGM data, with events ordered by timestamp, latest first.
    """
    collected_at = model.collected_at
    dcgm = get_instance_health_response(model).dcgm
    if dcgm is None:
        return HealthCheck(
            collected_at=collected_at,
            status=HealthStatus.HEALTHY,
            events=[],
        )
    dcgm_check = dcgm_health_response_to_health_check(dcgm, collected_at)
    ordered_events = sorted(
        dcgm_check.events,
        key=operator.attrgetter("timestamp"),
        reverse=True,
    )
    return HealthCheck(
        collected_at=collected_at,
        status=dcgm_check.status,
        events=ordered_events,
    )
157
+
158
+
159
def dcgm_health_response_to_health_check(
    response: DCGMHealthResponse, collected_at: datetime
) -> HealthCheck:
    """
    Build a `HealthCheck` from a DCGM health response.

    Every incident becomes one `HealthEvent` stamped with `collected_at`;
    the check status is taken from the response's overall health.
    """
    incident_events = [
        HealthEvent(
            timestamp=collected_at,
            status=incident.health.to_health_status(),
            message=incident.error_message,
        )
        for incident in response.incidents
    ]
    overall_status = response.overall_health.to_health_status()
    return HealthCheck(
        collected_at=collected_at,
        status=overall_status,
        events=incident_events,
    )
176
+
177
+
178
def get_instance_health_response(
    instance_health_check_model: InstanceHealthCheckModel,
) -> InstanceHealthResponse:
    """Parse the raw response stored on a health check row into an `InstanceHealthResponse`."""
    raw_response = instance_health_check_model.response
    # NOTE(review): `__response__` is presumably the response-side variant of the model — confirm.
    response_model = InstanceHealthResponse.__response__
    return response_model.parse_raw(raw_response)
182
+
183
+
85
184
  def get_instance_provisioning_data(instance_model: InstanceModel) -> Optional[JobProvisioningData]:
86
185
  if instance_model.job_provisioning_data is None:
87
186
  return None
@@ -195,6 +294,8 @@ def filter_pool_instances(
195
294
  continue
196
295
  if instance.unreachable:
197
296
  continue
297
+ if instance.health.is_failure():
298
+ continue
198
299
  fleet = instance.fleet
199
300
  if profile.fleets is not None and (fleet is None or fleet.name not in profile.fleets):
200
301
  continue
@@ -372,18 +473,15 @@ async def list_user_instances(
372
473
  limit: int,
373
474
  ascending: bool,
374
475
  ) -> List[Instance]:
375
- if user.global_role == GlobalRole.ADMIN:
376
- projects = await list_project_models(session=session)
377
- else:
378
- projects = await list_user_project_models(session=session, user=user)
379
- if not projects:
380
- return []
381
-
476
+ projects = await list_user_project_models(
477
+ session=session,
478
+ user=user,
479
+ only_names=True,
480
+ )
382
481
  if project_names is not None:
383
- projects = [proj for proj in projects if proj.name in project_names]
482
+ projects = [p for p in projects if p.name in project_names]
384
483
  if len(projects) == 0:
385
484
  return []
386
-
387
485
  instance_models = await list_projects_instance_models(
388
486
  session=session,
389
487
  projects=projects,
@@ -1,13 +1,13 @@
1
1
  import itertools
2
2
  import json
3
- from datetime import timedelta, timezone
3
+ from datetime import timedelta
4
4
  from typing import Dict, Iterable, List, Optional, Tuple
5
5
  from uuid import UUID
6
6
 
7
7
  import requests
8
8
  from sqlalchemy import select
9
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
- from sqlalchemy.orm import joinedload
10
+ from sqlalchemy.orm import joinedload, load_only
11
11
 
12
12
  import dstack._internal.server.services.backends as backends_services
13
13
  from dstack._internal.core.backends.base.backend import Backend
@@ -52,6 +52,7 @@ from dstack._internal.server.services.jobs.configurators.dev import DevEnvironme
52
52
  from dstack._internal.server.services.jobs.configurators.service import ServiceJobConfigurator
53
53
  from dstack._internal.server.services.jobs.configurators.task import TaskJobConfigurator
54
54
  from dstack._internal.server.services.logging import fmt
55
+ from dstack._internal.server.services.probes import probe_model_to_probe
55
56
  from dstack._internal.server.services.runner import client
56
57
  from dstack._internal.server.services.runner.ssh import runner_ssh_tunnel
57
58
  from dstack._internal.server.services.volumes import (
@@ -115,7 +116,9 @@ async def get_run_job_model(
115
116
  return res.scalar_one_or_none()
116
117
 
117
118
 
118
- def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
119
+ def job_model_to_job_submission(
120
+ job_model: JobModel, include_probes: bool = False
121
+ ) -> JobSubmission:
119
122
  job_provisioning_data = get_job_provisioning_data(job_model)
120
123
  if job_provisioning_data is not None:
121
124
  # TODO remove after transitioning to computed fields
@@ -130,17 +133,20 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
130
133
  ):
131
134
  backend_data = json.loads(job_provisioning_data.backend_data)
132
135
  job_provisioning_data.backend = backend_data["base_backend"]
133
- last_processed_at = job_model.last_processed_at.replace(tzinfo=timezone.utc)
136
+ last_processed_at = job_model.last_processed_at
134
137
  finished_at = None
135
138
  if job_model.status.is_finished():
136
139
  finished_at = last_processed_at
137
140
  status_message = _get_job_status_message(job_model)
138
141
  error = _get_job_error(job_model)
142
+ probes = []
143
+ if include_probes:
144
+ probes = [probe_model_to_probe(pm) for pm in job_model.probes]
139
145
  return JobSubmission(
140
146
  id=job_model.id,
141
147
  submission_num=job_model.submission_num,
142
148
  deployment_num=job_model.deployment_num,
143
- submitted_at=job_model.submitted_at.replace(tzinfo=timezone.utc),
149
+ submitted_at=job_model.submitted_at,
144
150
  last_processed_at=last_processed_at,
145
151
  finished_at=finished_at,
146
152
  inactivity_secs=job_model.inactivity_secs,
@@ -152,6 +158,7 @@ def job_model_to_job_submission(job_model: JobModel) -> JobSubmission:
152
158
  job_provisioning_data=job_provisioning_data,
153
159
  job_runtime_data=get_job_runtime_data(job_model),
154
160
  error=error,
161
+ probes=probes,
155
162
  )
156
163
 
157
164
 
@@ -231,10 +238,7 @@ async def process_terminating_job(
231
238
  Graceful stop should already be done by `process_terminating_run`.
232
239
  Caller must acquire the locks on the job and the job's instance.
233
240
  """
234
- if (
235
- job_model.remove_at is not None
236
- and job_model.remove_at.replace(tzinfo=timezone.utc) > common.get_current_datetime()
237
- ):
241
+ if job_model.remove_at is not None and job_model.remove_at > common.get_current_datetime():
238
242
  # it's too early to terminate the instance
239
243
  return
240
244
 
@@ -550,24 +554,25 @@ def _should_force_detach_volume(job_model: JobModel, stop_duration: Optional[int
550
554
  return (
551
555
  job_model.volumes_detached_at is not None
552
556
  and common.get_current_datetime()
553
- > job_model.volumes_detached_at.replace(tzinfo=timezone.utc) + MIN_FORCE_DETACH_WAIT_PERIOD
557
+ > job_model.volumes_detached_at + MIN_FORCE_DETACH_WAIT_PERIOD
554
558
  and (
555
559
  job_model.termination_reason == JobTerminationReason.ABORTED_BY_USER
556
560
  or stop_duration is not None
557
561
  and common.get_current_datetime()
558
- > job_model.volumes_detached_at.replace(tzinfo=timezone.utc)
559
- + timedelta(seconds=stop_duration)
562
+ > job_model.volumes_detached_at + timedelta(seconds=stop_duration)
560
563
  )
561
564
  )
562
565
 
563
566
 
564
567
  async def get_instances_ids_with_detaching_volumes(session: AsyncSession) -> List[UUID]:
565
568
  res = await session.execute(
566
- select(JobModel).where(
569
+ select(JobModel)
570
+ .where(
567
571
  JobModel.status == JobStatus.TERMINATING,
568
572
  JobModel.used_instance_id.is_not(None),
569
573
  JobModel.volumes_detached_at.is_not(None),
570
574
  )
575
+ .options(load_only(JobModel.used_instance_id))
571
576
  )
572
577
  job_models = res.scalars().all()
573
578
  return [jm.used_instance_id for jm in job_models if jm.used_instance_id]
@@ -11,8 +11,14 @@ from dstack._internal import settings
11
11
  from dstack._internal.core.errors import DockerRegistryError, ServerClientError
12
12
  from dstack._internal.core.models.common import RegistryAuth
13
13
  from dstack._internal.core.models.configurations import (
14
+ DEFAULT_PROBE_INTERVAL,
15
+ DEFAULT_PROBE_METHOD,
16
+ DEFAULT_PROBE_READY_AFTER,
17
+ DEFAULT_PROBE_TIMEOUT,
18
+ DEFAULT_PROBE_URL,
14
19
  DEFAULT_REPO_DIR,
15
20
  PortMapping,
21
+ ProbeConfig,
16
22
  PythonVersion,
17
23
  RunConfigurationType,
18
24
  ServiceConfiguration,
@@ -26,6 +32,7 @@ from dstack._internal.core.models.runs import (
26
32
  AppSpec,
27
33
  JobSpec,
28
34
  JobSSHKey,
35
+ ProbeSpec,
29
36
  Requirements,
30
37
  Retry,
31
38
  RunSpec,
@@ -155,6 +162,7 @@ class JobConfigurator(ABC):
155
162
  repo_code_hash=self.run_spec.repo_code_hash,
156
163
  file_archives=self.run_spec.file_archives,
157
164
  service_port=self._service_port(),
165
+ probes=self._probes(),
158
166
  )
159
167
  return job_spec
160
168
 
@@ -313,6 +321,11 @@ class JobConfigurator(ABC):
313
321
  return self.run_spec.configuration.port.container_port
314
322
  return None
315
323
 
324
def _probes(self) -> list[ProbeSpec]:
    """
    Return probe specs for the job.

    Only service configurations contribute probes; any other configuration
    type yields an empty list.
    """
    configuration = self.run_spec.configuration
    if not isinstance(configuration, ServiceConfiguration):
        return []
    return [_probe_config_to_spec(probe_config) for probe_config in configuration.probes]
328
+
316
329
 
317
330
  def interpolate_job_volumes(
318
331
  run_volumes: List[Union[MountPoint, str]],
@@ -353,6 +366,19 @@ def interpolate_job_volumes(
353
366
  return job_volumes
354
367
 
355
368
 
369
def _probe_config_to_spec(c: ProbeConfig) -> ProbeSpec:
    """
    Turn a user-facing probe config into a probe spec.

    `url`, `timeout`, `interval`, `ready_after` and `method` fall back to the
    corresponding `DEFAULT_PROBE_*` constant when left as `None`; the remaining
    fields are copied as they are.
    """

    def _or_default(value, default):
        # Only None triggers the fallback, so falsy values such as 0 are kept.
        return default if value is None else value

    return ProbeSpec(
        type=c.type,
        url=_or_default(c.url, DEFAULT_PROBE_URL),
        timeout=_or_default(c.timeout, DEFAULT_PROBE_TIMEOUT),
        interval=_or_default(c.interval, DEFAULT_PROBE_INTERVAL),
        ready_after=_or_default(c.ready_after, DEFAULT_PROBE_READY_AFTER),
        method=_or_default(c.method, DEFAULT_PROBE_METHOD),
        headers=c.headers,
        body=c.body,
    )
380
+
381
+
356
382
  def _join_shell_commands(commands: List[str]) -> str:
357
383
  for i, cmd in enumerate(commands):
358
384
  cmd = cmd.strip()
@@ -1,9 +1,9 @@
1
1
  from typing import Union
2
2
 
3
- from dstack._internal.server.models import GatewayModel, JobModel, RunModel
3
+ from dstack._internal.server.models import GatewayModel, JobModel, ProbeModel, RunModel
4
4
 
5
5
 
6
- def fmt(model: Union[RunModel, JobModel, GatewayModel]) -> str:
6
+ def fmt(model: Union[RunModel, JobModel, GatewayModel, ProbeModel]) -> str:
7
7
  """Consistent string representation of a model for logging."""
8
8
  if isinstance(model, RunModel):
9
9
  return f"run({model.id.hex[:6]}){model.run_name}"
@@ -11,4 +11,6 @@ def fmt(model: Union[RunModel, JobModel, GatewayModel]) -> str:
11
11
  return f"job({model.id.hex[:6]}){model.job_name}"
12
12
  if isinstance(model, GatewayModel):
13
13
  return f"gateway({model.id.hex[:6]}){model.name}"
14
+ if isinstance(model, ProbeModel):
15
+ return f"probe({model.id.hex[:6]}){model.name}"
14
16
  return str(model)