dstack 0.19.21__py3-none-any.whl → 0.19.23rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. See the release advisory for more details.

Files changed (71)
  1. dstack/_internal/cli/commands/apply.py +8 -3
  2. dstack/_internal/cli/services/configurators/__init__.py +8 -0
  3. dstack/_internal/cli/services/configurators/fleet.py +1 -1
  4. dstack/_internal/cli/services/configurators/gateway.py +1 -1
  5. dstack/_internal/cli/services/configurators/run.py +11 -1
  6. dstack/_internal/cli/services/configurators/volume.py +1 -1
  7. dstack/_internal/cli/utils/common.py +48 -5
  8. dstack/_internal/cli/utils/fleet.py +5 -5
  9. dstack/_internal/cli/utils/run.py +32 -0
  10. dstack/_internal/core/backends/configurators.py +9 -0
  11. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  12. dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
  13. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  14. dstack/_internal/core/backends/hotaisle/compute.py +225 -0
  15. dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
  16. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  17. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  18. dstack/_internal/core/backends/models.py +8 -0
  19. dstack/_internal/core/backends/nebius/compute.py +8 -2
  20. dstack/_internal/core/backends/nebius/fabrics.py +1 -0
  21. dstack/_internal/core/backends/nebius/resources.py +9 -0
  22. dstack/_internal/core/compatibility/runs.py +8 -0
  23. dstack/_internal/core/models/backends/base.py +2 -0
  24. dstack/_internal/core/models/configurations.py +139 -1
  25. dstack/_internal/core/models/health.py +28 -0
  26. dstack/_internal/core/models/instances.py +2 -0
  27. dstack/_internal/core/models/logs.py +2 -1
  28. dstack/_internal/core/models/runs.py +21 -1
  29. dstack/_internal/core/services/ssh/tunnel.py +7 -0
  30. dstack/_internal/server/app.py +4 -0
  31. dstack/_internal/server/background/__init__.py +4 -0
  32. dstack/_internal/server/background/tasks/process_instances.py +107 -56
  33. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  34. dstack/_internal/server/background/tasks/process_running_jobs.py +13 -0
  35. dstack/_internal/server/background/tasks/process_runs.py +21 -14
  36. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  37. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  38. dstack/_internal/server/models.py +41 -0
  39. dstack/_internal/server/routers/instances.py +33 -5
  40. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  41. dstack/_internal/server/schemas/instances.py +32 -0
  42. dstack/_internal/server/schemas/runner.py +5 -0
  43. dstack/_internal/server/services/instances.py +103 -1
  44. dstack/_internal/server/services/jobs/__init__.py +8 -1
  45. dstack/_internal/server/services/jobs/configurators/base.py +26 -0
  46. dstack/_internal/server/services/logging.py +4 -2
  47. dstack/_internal/server/services/logs/aws.py +13 -1
  48. dstack/_internal/server/services/logs/gcp.py +16 -1
  49. dstack/_internal/server/services/probes.py +6 -0
  50. dstack/_internal/server/services/projects.py +16 -4
  51. dstack/_internal/server/services/runner/client.py +52 -20
  52. dstack/_internal/server/services/runner/ssh.py +4 -4
  53. dstack/_internal/server/services/runs.py +49 -13
  54. dstack/_internal/server/services/ssh.py +66 -0
  55. dstack/_internal/server/settings.py +13 -0
  56. dstack/_internal/server/statics/index.html +1 -1
  57. dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
  58. dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
  59. dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
  60. dstack/_internal/server/testing/common.py +44 -0
  61. dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
  62. dstack/_internal/settings.py +3 -0
  63. dstack/_internal/utils/common.py +15 -0
  64. dstack/api/server/__init__.py +1 -1
  65. dstack/version.py +1 -1
  66. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/METADATA +14 -14
  67. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/RECORD +71 -58
  68. /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
  69. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/WHEEL +0 -0
  70. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/entry_points.txt +0 -0
  71. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,60 @@
1
+ import json
2
+
3
+ from dstack._internal.core.backends.base.configurator import (
4
+ BackendRecord,
5
+ Configurator,
6
+ )
7
+ from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient
8
+ from dstack._internal.core.backends.hotaisle.backend import HotAisleBackend
9
+ from dstack._internal.core.backends.hotaisle.models import (
10
+ AnyHotAisleBackendConfig,
11
+ AnyHotAisleCreds,
12
+ HotAisleBackendConfig,
13
+ HotAisleBackendConfigWithCreds,
14
+ HotAisleConfig,
15
+ HotAisleCreds,
16
+ HotAisleStoredConfig,
17
+ )
18
+ from dstack._internal.core.models.backends.base import (
19
+ BackendType,
20
+ )
21
+
22
+
23
+ class HotAisleConfigurator(Configurator):
24
+ TYPE = BackendType.HOTAISLE
25
+ BACKEND_CLASS = HotAisleBackend
26
+
27
+ def validate_config(self, config: HotAisleBackendConfigWithCreds, default_creds_enabled: bool):
28
+ self._validate_creds(config.creds, config.team_handle)
29
+
30
+ def create_backend(
31
+ self, project_name: str, config: HotAisleBackendConfigWithCreds
32
+ ) -> BackendRecord:
33
+ return BackendRecord(
34
+ config=HotAisleStoredConfig(
35
+ **HotAisleBackendConfig.__response__.parse_obj(config).dict()
36
+ ).json(),
37
+ auth=HotAisleCreds.parse_obj(config.creds).json(),
38
+ )
39
+
40
+ def get_backend_config(
41
+ self, record: BackendRecord, include_creds: bool
42
+ ) -> AnyHotAisleBackendConfig:
43
+ config = self._get_config(record)
44
+ if include_creds:
45
+ return HotAisleBackendConfigWithCreds.__response__.parse_obj(config)
46
+ return HotAisleBackendConfig.__response__.parse_obj(config)
47
+
48
+ def get_backend(self, record: BackendRecord) -> HotAisleBackend:
49
+ config = self._get_config(record)
50
+ return HotAisleBackend(config=config)
51
+
52
+ def _get_config(self, record: BackendRecord) -> HotAisleConfig:
53
+ return HotAisleConfig.__response__(
54
+ **json.loads(record.config),
55
+ creds=HotAisleCreds.parse_raw(record.auth),
56
+ )
57
+
58
+ def _validate_creds(self, creds: AnyHotAisleCreds, team_handle: str):
59
+ api_client = HotAisleAPIClient(creds.api_key, team_handle)
60
+ api_client.validate_api_key()
@@ -0,0 +1,45 @@
1
+ from typing import Annotated, List, Literal, Optional, Union
2
+
3
+ from pydantic import Field
4
+
5
+ from dstack._internal.core.models.common import CoreModel
6
+
7
+
8
+ class HotAisleAPIKeyCreds(CoreModel):
9
+ type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key"
10
+ api_key: Annotated[str, Field(description="The Hot Aisle API key")]
11
+
12
+
13
+ AnyHotAisleCreds = HotAisleAPIKeyCreds
14
+ HotAisleCreds = AnyHotAisleCreds
15
+
16
+
17
+ class HotAisleBackendConfig(CoreModel):
18
+ type: Annotated[
19
+ Literal["hotaisle"],
20
+ Field(description="The type of backend"),
21
+ ] = "hotaisle"
22
+ team_handle: Annotated[str, Field(description="The Hot Aisle team handle")]
23
+ regions: Annotated[
24
+ Optional[List[str]],
25
+ Field(description="The list of Hot Aisle regions. Omit to use all regions"),
26
+ ] = None
27
+
28
+
29
+ class HotAisleBackendConfigWithCreds(HotAisleBackendConfig):
30
+ creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")]
31
+
32
+
33
+ AnyHotAisleBackendConfig = Union[HotAisleBackendConfig, HotAisleBackendConfigWithCreds]
34
+
35
+
36
+ class HotAisleBackendFileConfigWithCreds(HotAisleBackendConfig):
37
+ creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")]
38
+
39
+
40
+ class HotAisleStoredConfig(HotAisleBackendConfig):
41
+ pass
42
+
43
+
44
+ class HotAisleConfig(HotAisleStoredConfig):
45
+ creds: AnyHotAisleCreds
@@ -206,10 +206,11 @@ def _launch_runner(
206
206
  ssh_private_key: str,
207
207
  launch_command: str,
208
208
  ):
209
+ daemonized_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown"
209
210
  _run_ssh_command(
210
211
  hostname=hostname,
211
212
  ssh_private_key=ssh_private_key,
212
- command=launch_command,
213
+ command=daemonized_command,
213
214
  )
214
215
 
215
216
 
@@ -29,6 +29,11 @@ from dstack._internal.core.backends.gcp.models import (
29
29
  GCPBackendConfigWithCreds,
30
30
  GCPBackendFileConfigWithCreds,
31
31
  )
32
+ from dstack._internal.core.backends.hotaisle.models import (
33
+ HotAisleBackendConfig,
34
+ HotAisleBackendConfigWithCreds,
35
+ HotAisleBackendFileConfigWithCreds,
36
+ )
32
37
  from dstack._internal.core.backends.kubernetes.models import (
33
38
  KubernetesBackendConfig,
34
39
  KubernetesBackendConfigWithCreds,
@@ -73,6 +78,7 @@ AnyBackendConfigWithoutCreds = Union[
73
78
  CudoBackendConfig,
74
79
  DataCrunchBackendConfig,
75
80
  GCPBackendConfig,
81
+ HotAisleBackendConfig,
76
82
  KubernetesBackendConfig,
77
83
  LambdaBackendConfig,
78
84
  NebiusBackendConfig,
@@ -95,6 +101,7 @@ AnyBackendConfigWithCreds = Union[
95
101
  CudoBackendConfigWithCreds,
96
102
  DataCrunchBackendConfigWithCreds,
97
103
  GCPBackendConfigWithCreds,
104
+ HotAisleBackendConfigWithCreds,
98
105
  KubernetesBackendConfigWithCreds,
99
106
  LambdaBackendConfigWithCreds,
100
107
  OCIBackendConfigWithCreds,
@@ -116,6 +123,7 @@ AnyBackendFileConfigWithCreds = Union[
116
123
  CudoBackendConfigWithCreds,
117
124
  DataCrunchBackendConfigWithCreds,
118
125
  GCPBackendFileConfigWithCreds,
126
+ HotAisleBackendFileConfigWithCreds,
119
127
  KubernetesBackendFileConfigWithCreds,
120
128
  LambdaBackendConfigWithCreds,
121
129
  OCIBackendConfigWithCreds,
@@ -74,6 +74,7 @@ SETUP_COMMANDS = [
74
74
  SUPPORTED_PLATFORMS = [
75
75
  "gpu-h100-sxm",
76
76
  "gpu-h200-sxm",
77
+ "gpu-b200-sxm",
77
78
  "gpu-l40s-a",
78
79
  "gpu-l40s-d",
79
80
  "cpu-d3",
@@ -150,12 +151,16 @@ class NebiusCompute(
150
151
  )
151
152
  if backend_data.cluster is not None:
152
153
  cluster_id = backend_data.cluster.id
154
+
155
+ gpus = instance_offer.instance.resources.gpus
153
156
  create_disk_op = resources.create_disk(
154
157
  sdk=self._sdk,
155
158
  name=instance_name,
156
159
  project_id=self._region_to_project_id[instance_offer.region],
157
160
  size_mib=instance_offer.instance.resources.disk.size_mib,
158
- image_family="ubuntu22.04-cuda12",
161
+ image_family="ubuntu24.04-cuda12"
162
+ if gpus and gpus[0].name == "B200"
163
+ else "ubuntu22.04-cuda12",
159
164
  )
160
165
  create_instance_op = None
161
166
  try:
@@ -180,6 +185,7 @@ class NebiusCompute(
180
185
  cluster_id=cluster_id,
181
186
  disk_id=create_disk_op.resource_id,
182
187
  subnet_id=self._get_subnet_id(instance_offer.region),
188
+ preemptible=instance_offer.instance.resources.spot,
183
189
  )
184
190
  _wait_for_instance(self._sdk, create_instance_op)
185
191
  except BaseException:
@@ -367,4 +373,4 @@ def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
367
373
 
368
374
  def _supported_instances(offer: InstanceOffer) -> bool:
369
375
  platform, _ = offer.instance.name.split()
370
- return platform in SUPPORTED_PLATFORMS and not offer.instance.resources.spot
376
+ return platform in SUPPORTED_PLATFORMS
@@ -21,6 +21,7 @@ INFINIBAND_FABRICS = [
21
21
  InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
22
22
  InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
23
23
  InfinibandFabric("us-central1-a", "gpu-h200-sxm", "us-central1"),
24
+ InfinibandFabric("us-central1-b", "gpu-b200-sxm", "us-central1"),
24
25
  ]
25
26
 
26
27
 
@@ -28,10 +28,12 @@ from nebius.api.nebius.compute.v1 import (
28
28
  GpuClusterSpec,
29
29
  Instance,
30
30
  InstanceGpuClusterSpec,
31
+ InstanceRecoveryPolicy,
31
32
  InstanceServiceClient,
32
33
  InstanceSpec,
33
34
  IPAddress,
34
35
  NetworkInterfaceSpec,
36
+ PreemptibleSpec,
35
37
  PublicIPAddress,
36
38
  ResourcesSpec,
37
39
  SourceImageFamily,
@@ -283,6 +285,7 @@ def create_instance(
283
285
  cluster_id: Optional[str],
284
286
  disk_id: str,
285
287
  subnet_id: str,
288
+ preemptible: bool,
286
289
  ) -> SDKOperation[Operation]:
287
290
  client = InstanceServiceClient(sdk)
288
291
  request = CreateInstanceRequest(
@@ -306,6 +309,12 @@ def create_instance(
306
309
  public_ip_address=PublicIPAddress(static=True),
307
310
  )
308
311
  ],
312
+ preemptible=PreemptibleSpec(
313
+ priority=1, on_preemption=PreemptibleSpec.PreemptionPolicy.STOP
314
+ )
315
+ if preemptible
316
+ else None,
317
+ recovery_policy=InstanceRecoveryPolicy.FAIL if preemptible else None,
309
318
  ),
310
319
  )
311
320
  with wrap_capacity_errors():
@@ -53,6 +53,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
53
53
  job_submissions_excludes["exit_status"] = True
54
54
  if all(js.deployment_num == 0 for js in job_submissions):
55
55
  job_submissions_excludes["deployment_num"] = True
56
+ if all(not js.probes for js in job_submissions):
57
+ job_submissions_excludes["probes"] = True
56
58
  latest_job_submission = current_resource.latest_job_submission
57
59
  if latest_job_submission is not None:
58
60
  latest_job_submission_excludes: IncludeExcludeDictType = {}
@@ -69,6 +71,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
69
71
  latest_job_submission_excludes["exit_status"] = True
70
72
  if latest_job_submission.deployment_num == 0:
71
73
  latest_job_submission_excludes["deployment_num"] = True
74
+ if not latest_job_submission.probes:
75
+ latest_job_submission_excludes["probes"] = True
72
76
  return {"plan": apply_plan_excludes}
73
77
 
74
78
 
@@ -120,6 +124,8 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
120
124
  profile_excludes.add("startup_order")
121
125
  if configuration.stop_criteria is None:
122
126
  configuration_excludes["stop_criteria"] = True
127
+ if isinstance(configuration, ServiceConfiguration) and not configuration.probes:
128
+ configuration_excludes["probes"] = True
123
129
  if profile is not None and profile.stop_criteria is None:
124
130
  profile_excludes.add("stop_criteria")
125
131
  if not configuration.files:
@@ -154,6 +160,8 @@ def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType:
154
160
  spec_excludes["file_archives"] = True
155
161
  if all(s.service_port is None for s in job_specs):
156
162
  spec_excludes["service_port"] = True
163
+ if all(not s.probes for s in job_specs):
164
+ spec_excludes["probes"] = True
157
165
 
158
166
  return spec_excludes
159
167
 
@@ -11,6 +11,7 @@ class BackendType(str, enum.Enum):
11
11
  DSTACK (BackendType): dstack Sky
12
12
  GCP (BackendType): Google Cloud Platform
13
13
  DATACRUNCH (BackendType): DataCrunch
14
+ HOTAISLE (BackendType): Hot Aisle
14
15
  KUBERNETES (BackendType): Kubernetes
15
16
  LAMBDA (BackendType): Lambda Cloud
16
17
  NEBIUS (BackendType): Nebius AI Cloud
@@ -28,6 +29,7 @@ class BackendType(str, enum.Enum):
28
29
  DATACRUNCH = "datacrunch"
29
30
  DSTACK = "dstack"
30
31
  GCP = "gcp"
32
+ HOTAISLE = "hotaisle"
31
33
  KUBERNETES = "kubernetes"
32
34
  LAMBDA = "lambda"
33
35
  LOCAL = "local"
@@ -14,11 +14,12 @@ from dstack._internal.core.models.envs import Env
14
14
  from dstack._internal.core.models.files import FilePathMapping
15
15
  from dstack._internal.core.models.fleets import FleetConfiguration
16
16
  from dstack._internal.core.models.gateways import GatewayConfiguration
17
- from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
17
+ from dstack._internal.core.models.profiles import ProfileParams, parse_duration, parse_off_duration
18
18
  from dstack._internal.core.models.resources import Range, ResourcesSpec
19
19
  from dstack._internal.core.models.services import AnyModel, OpenAIChatModel
20
20
  from dstack._internal.core.models.unix import UnixUser
21
21
  from dstack._internal.core.models.volumes import MountPoint, VolumeConfiguration, parse_mount_point
22
+ from dstack._internal.utils.common import has_duplicates
22
23
  from dstack._internal.utils.json_utils import (
23
24
  pydantic_orjson_dumps_with_indent,
24
25
  )
@@ -32,6 +33,14 @@ RUN_PRIOTIRY_MIN = 0
32
33
  RUN_PRIOTIRY_MAX = 100
33
34
  RUN_PRIORITY_DEFAULT = 0
34
35
  DEFAULT_REPO_DIR = "/workflow"
36
+ MIN_PROBE_TIMEOUT = 1
37
+ MIN_PROBE_INTERVAL = 1
38
+ DEFAULT_PROBE_URL = "/"
39
+ DEFAULT_PROBE_TIMEOUT = 10
40
+ DEFAULT_PROBE_INTERVAL = 15
41
+ DEFAULT_PROBE_READY_AFTER = 1
42
+ DEFAULT_PROBE_METHOD = "get"
43
+ MAX_PROBE_URL_LEN = 2048
35
44
 
36
45
 
37
46
  class RunConfigurationType(str, Enum):
@@ -162,6 +171,121 @@ class RateLimit(CoreModel):
162
171
  ] = 0
163
172
 
164
173
 
174
+ HTTPMethod = Literal["get", "post", "put", "delete", "patch", "head"]
175
+
176
+
177
+ class HTTPHeaderSpec(CoreModel):
178
+ name: Annotated[
179
+ str,
180
+ Field(
181
+ description="The name of the HTTP header",
182
+ min_length=1,
183
+ max_length=256,
184
+ ),
185
+ ]
186
+ value: Annotated[
187
+ str,
188
+ Field(
189
+ description="The value of the HTTP header",
190
+ min_length=1,
191
+ max_length=2048,
192
+ ),
193
+ ]
194
+
195
+
196
+ class ProbeConfig(CoreModel):
197
+ type: Literal["http"] # expect other probe types in the future, namely `exec`
198
+ url: Annotated[
199
+ Optional[str], Field(description=f"The URL to request. Defaults to `{DEFAULT_PROBE_URL}`")
200
+ ] = None
201
+ method: Annotated[
202
+ Optional[HTTPMethod],
203
+ Field(
204
+ description=(
205
+ "The HTTP method to use for the probe (e.g., `get`, `post`, etc.)."
206
+ f" Defaults to `{DEFAULT_PROBE_METHOD}`"
207
+ )
208
+ ),
209
+ ] = None
210
+ headers: Annotated[
211
+ list[HTTPHeaderSpec],
212
+ Field(description="A list of HTTP headers to include in the request", max_items=16),
213
+ ] = []
214
+ body: Annotated[
215
+ Optional[str],
216
+ Field(
217
+ description="The HTTP request body to send with the probe",
218
+ min_length=1,
219
+ max_length=2048,
220
+ ),
221
+ ] = None
222
+ timeout: Annotated[
223
+ Optional[Union[int, str]],
224
+ Field(
225
+ description=(
226
+ f"Maximum amount of time the HTTP request is allowed to take. Defaults to `{DEFAULT_PROBE_TIMEOUT}s`"
227
+ )
228
+ ),
229
+ ] = None
230
+ interval: Annotated[
231
+ Optional[Union[int, str]],
232
+ Field(
233
+ description=(
234
+ "Minimum amount of time between the end of one probe execution"
235
+ f" and the start of the next. Defaults to `{DEFAULT_PROBE_INTERVAL}s`"
236
+ )
237
+ ),
238
+ ] = None
239
+ ready_after: Annotated[
240
+ Optional[int],
241
+ Field(
242
+ ge=1,
243
+ description=(
244
+ "The number of consecutive successful probe executions required for the replica"
245
+ " to be considered ready. Used during rolling deployments."
246
+ f" Defaults to `{DEFAULT_PROBE_READY_AFTER}`"
247
+ ),
248
+ ),
249
+ ] = None
250
+
251
+ @validator("timeout")
252
+ def parse_timeout(cls, v: Optional[Union[int, str]]) -> Optional[int]:
253
+ if v is None:
254
+ return v
255
+ parsed = parse_duration(v)
256
+ if parsed < MIN_PROBE_TIMEOUT:
257
+ raise ValueError(f"Probe timeout cannot be shorter than {MIN_PROBE_TIMEOUT}s")
258
+ return parsed
259
+
260
+ @validator("interval")
261
+ def parse_interval(cls, v: Optional[Union[int, str]]) -> Optional[int]:
262
+ if v is None:
263
+ return v
264
+ parsed = parse_duration(v)
265
+ if parsed < MIN_PROBE_INTERVAL:
266
+ raise ValueError(f"Probe interval cannot be shorter than {MIN_PROBE_INTERVAL}s")
267
+ return parsed
268
+
269
+ @validator("url")
270
+ def validate_url(cls, v: Optional[str]) -> Optional[str]:
271
+ if v is None:
272
+ return v
273
+ if not v.startswith("/"):
274
+ raise ValueError("Must start with `/`")
275
+ if len(v) > MAX_PROBE_URL_LEN:
276
+ raise ValueError(f"Cannot be longer than {MAX_PROBE_URL_LEN} characters")
277
+ if not v.isprintable():
278
+ raise ValueError("Cannot contain non-printable characters")
279
+ return v
280
+
281
+ @root_validator
282
+ def validate_body_matches_method(cls, values):
283
+ method: HTTPMethod = values["method"]
284
+ if values["body"] is not None and method in ["get", "head"]:
285
+ raise ValueError(f"Cannot set request body for the `{method}` method")
286
+ return values
287
+
288
+
165
289
  class BaseRunConfiguration(CoreModel):
166
290
  type: Literal["none"]
167
291
  name: Annotated[
@@ -448,6 +572,10 @@ class ServiceConfigurationParams(CoreModel):
448
572
  Field(description="The auto-scaling rules. Required if `replicas` is set to a range"),
449
573
  ] = None
450
574
  rate_limits: Annotated[list[RateLimit], Field(description="Rate limiting rules")] = []
575
+ probes: Annotated[
576
+ list[ProbeConfig],
577
+ Field(description="List of probes used to determine job health"),
578
+ ] = []
451
579
 
452
580
  @validator("port")
453
581
  def convert_port(cls, v) -> PortMapping:
@@ -511,6 +639,16 @@ class ServiceConfigurationParams(CoreModel):
511
639
  )
512
640
  return v
513
641
 
642
+ @validator("probes")
643
+ def validate_probes(cls, v: list[ProbeConfig]) -> list[ProbeConfig]:
644
+ if has_duplicates(v):
645
+ # Using a custom validator instead of Field(unique_items=True) to avoid Pydantic bug:
646
+ # https://github.com/pydantic/pydantic/issues/3765
647
+ # Because of the bug, our gen_schema_reference.py fails to determine the type of
648
+ # ServiceConfiguration.probes and insert the correct hyperlink.
649
+ raise ValueError("Probes must be unique")
650
+ return v
651
+
514
652
 
515
653
  class ServiceConfiguration(
516
654
  ProfileParams, BaseRunConfigurationWithCommands, ServiceConfigurationParams
@@ -0,0 +1,28 @@
1
+ from datetime import datetime
2
+ from enum import Enum
3
+
4
+ from dstack._internal.core.models.common import CoreModel
5
+
6
+
7
+ class HealthStatus(str, Enum):
8
+ HEALTHY = "healthy"
9
+ WARNING = "warning"
10
+ FAILURE = "failure"
11
+
12
+ def is_healthy(self) -> bool:
13
+ return self == self.HEALTHY
14
+
15
+ def is_failure(self) -> bool:
16
+ return self == self.FAILURE
17
+
18
+
19
+ class HealthEvent(CoreModel):
20
+ timestamp: datetime
21
+ status: HealthStatus
22
+ message: str
23
+
24
+
25
+ class HealthCheck(CoreModel):
26
+ collected_at: datetime
27
+ status: HealthStatus
28
+ events: list[HealthEvent]
@@ -9,6 +9,7 @@ from pydantic import root_validator
9
9
  from dstack._internal.core.models.backends.base import BackendType
10
10
  from dstack._internal.core.models.common import CoreModel
11
11
  from dstack._internal.core.models.envs import Env
12
+ from dstack._internal.core.models.health import HealthStatus
12
13
  from dstack._internal.core.models.volumes import Volume
13
14
  from dstack._internal.utils.common import pretty_resources
14
15
 
@@ -225,6 +226,7 @@ class Instance(CoreModel):
225
226
  hostname: Optional[str] = None
226
227
  status: InstanceStatus
227
228
  unreachable: bool = False
229
+ health_status: HealthStatus = HealthStatus.HEALTHY
228
230
  termination_reason: Optional[str] = None
229
231
  created: datetime.datetime
230
232
  region: Optional[str] = None
@@ -23,4 +23,5 @@ class LogEvent(CoreModel):
23
23
 
24
24
  class JobSubmissionLogs(CoreModel):
25
25
  logs: List[LogEvent]
26
- next_token: Optional[str]
26
+ external_url: Optional[str] = None
27
+ next_token: Optional[str] = None
@@ -1,6 +1,6 @@
1
1
  from datetime import datetime, timedelta
2
2
  from enum import Enum
3
- from typing import Any, Dict, List, Optional, Type
3
+ from typing import Any, Dict, List, Literal, Optional, Type
4
4
 
5
5
  from pydantic import UUID4, Field, root_validator
6
6
  from typing_extensions import Annotated
@@ -8,8 +8,11 @@ from typing_extensions import Annotated
8
8
  from dstack._internal.core.models.backends.base import BackendType
9
9
  from dstack._internal.core.models.common import ApplyAction, CoreModel, NetworkMode, RegistryAuth
10
10
  from dstack._internal.core.models.configurations import (
11
+ DEFAULT_PROBE_METHOD,
11
12
  DEFAULT_REPO_DIR,
12
13
  AnyRunConfiguration,
14
+ HTTPHeaderSpec,
15
+ HTTPMethod,
13
16
  RunConfiguration,
14
17
  ServiceConfiguration,
15
18
  )
@@ -223,6 +226,17 @@ class JobSSHKey(CoreModel):
223
226
  public: str
224
227
 
225
228
 
229
+ class ProbeSpec(CoreModel):
230
+ type: Literal["http"] # expect other probe types in the future, namely `exec`
231
+ url: str
232
+ method: HTTPMethod = DEFAULT_PROBE_METHOD
233
+ headers: list[HTTPHeaderSpec] = []
234
+ body: Optional[str] = None
235
+ timeout: int
236
+ interval: int
237
+ ready_after: int
238
+
239
+
226
240
  class JobSpec(CoreModel):
227
241
  replica_num: int = 0 # default value for backward compatibility
228
242
  job_num: int
@@ -256,6 +270,7 @@ class JobSpec(CoreModel):
256
270
  file_archives: list[FileArchiveMapping] = []
257
271
  # None for non-services and pre-0.19.19 services. See `get_service_port`
258
272
  service_port: Optional[int] = None
273
+ probes: list[ProbeSpec] = []
259
274
 
260
275
 
261
276
  class JobProvisioningData(CoreModel):
@@ -325,6 +340,10 @@ class ClusterInfo(CoreModel):
325
340
  gpus_per_job: int
326
341
 
327
342
 
343
+ class Probe(CoreModel):
344
+ success_streak: int
345
+
346
+
328
347
  class JobSubmission(CoreModel):
329
348
  id: UUID4
330
349
  submission_num: int
@@ -341,6 +360,7 @@ class JobSubmission(CoreModel):
341
360
  job_provisioning_data: Optional[JobProvisioningData]
342
361
  job_runtime_data: Optional[JobRuntimeData]
343
362
  error: Optional[str] = None
363
+ probes: list[Probe] = []
344
364
 
345
365
  @property
346
366
  def age(self) -> timedelta:
@@ -236,6 +236,13 @@ class SSHTunnel:
236
236
  def __exit__(self, exc_type, exc_val, exc_tb):
237
237
  self.close()
238
238
 
239
+ async def __aenter__(self):
240
+ await self.aopen()
241
+ return self
242
+
243
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
244
+ await self.aclose()
245
+
239
246
  def _get_proxy_command(self) -> Optional[str]:
240
247
  proxy_command: Optional[str] = None
241
248
  for params, identity_path in self.ssh_proxies:
@@ -22,6 +22,7 @@ from dstack._internal.proxy.lib.deps import get_injector_from_app
22
22
  from dstack._internal.proxy.lib.routers import model_proxy
23
23
  from dstack._internal.server import settings
24
24
  from dstack._internal.server.background import start_background_tasks
25
+ from dstack._internal.server.background.tasks.process_probes import PROBES_SCHEDULER
25
26
  from dstack._internal.server.db import get_db, get_session_ctx, migrate
26
27
  from dstack._internal.server.routers import (
27
28
  backends,
@@ -155,6 +156,7 @@ async def lifespan(app: FastAPI):
155
156
  scheduler = start_background_tasks()
156
157
  else:
157
158
  logger.info("Background processing is disabled")
159
+ PROBES_SCHEDULER.start()
158
160
  dstack_version = DSTACK_VERSION if DSTACK_VERSION else "(no version)"
159
161
  logger.info(f"The admin token is {admin.token.get_plaintext_or_error()}", {"show_path": False})
160
162
  logger.info(
@@ -166,6 +168,7 @@ async def lifespan(app: FastAPI):
166
168
  yield
167
169
  if settings.SERVER_BACKGROUND_PROCESSING_ENABLED:
168
170
  scheduler.shutdown()
171
+ PROBES_SCHEDULER.shutdown(wait=False)
169
172
  await gateway_connections_pool.remove_all()
170
173
  service_conn_pool = await get_injector_from_app(app).get_service_connection_pool()
171
174
  await service_conn_pool.remove_all()
@@ -197,6 +200,7 @@ def register_routes(app: FastAPI, ui: bool = True):
197
200
  app.include_router(fleets.root_router)
198
201
  app.include_router(fleets.project_router)
199
202
  app.include_router(instances.root_router)
203
+ app.include_router(instances.project_router)
200
204
  app.include_router(repos.router)
201
205
  app.include_router(runs.root_router)
202
206
  app.include_router(runs.project_router)
@@ -9,6 +9,7 @@ from dstack._internal.server.background.tasks.process_gateways import (
9
9
  )
10
10
  from dstack._internal.server.background.tasks.process_idle_volumes import process_idle_volumes
11
11
  from dstack._internal.server.background.tasks.process_instances import (
12
+ delete_instance_health_checks,
12
13
  process_instances,
13
14
  )
14
15
  from dstack._internal.server.background.tasks.process_metrics import (
@@ -18,6 +19,7 @@ from dstack._internal.server.background.tasks.process_metrics import (
18
19
  from dstack._internal.server.background.tasks.process_placement_groups import (
19
20
  process_placement_groups,
20
21
  )
22
+ from dstack._internal.server.background.tasks.process_probes import process_probes
21
23
  from dstack._internal.server.background.tasks.process_prometheus_metrics import (
22
24
  collect_prometheus_metrics,
23
25
  delete_prometheus_metrics,
@@ -63,6 +65,7 @@ def start_background_tasks() -> AsyncIOScheduler:
63
65
  # that the first waiting for the lock will acquire it.
64
66
  # The jitter is needed to give all tasks a chance to acquire locks.
65
67
 
68
+ _scheduler.add_job(process_probes, IntervalTrigger(seconds=3, jitter=1))
66
69
  _scheduler.add_job(collect_metrics, IntervalTrigger(seconds=10), max_instances=1)
67
70
  _scheduler.add_job(delete_metrics, IntervalTrigger(minutes=5), max_instances=1)
68
71
  if settings.ENABLE_PROMETHEUS_METRICS:
@@ -84,6 +87,7 @@ def start_background_tasks() -> AsyncIOScheduler:
84
87
  IntervalTrigger(seconds=10, jitter=2),
85
88
  max_instances=1,
86
89
  )
90
+ _scheduler.add_job(delete_instance_health_checks, IntervalTrigger(minutes=5), max_instances=1)
87
91
  for replica in range(settings.SERVER_BACKGROUND_PROCESSING_FACTOR):
88
92
  # Add multiple copies of tasks if requested.
89
93
  # max_instances=1 for additional copies to avoid running too many tasks.