dstack 0.19.21__py3-none-any.whl → 0.19.23rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (71)
  1. dstack/_internal/cli/commands/apply.py +8 -3
  2. dstack/_internal/cli/services/configurators/__init__.py +8 -0
  3. dstack/_internal/cli/services/configurators/fleet.py +1 -1
  4. dstack/_internal/cli/services/configurators/gateway.py +1 -1
  5. dstack/_internal/cli/services/configurators/run.py +11 -1
  6. dstack/_internal/cli/services/configurators/volume.py +1 -1
  7. dstack/_internal/cli/utils/common.py +48 -5
  8. dstack/_internal/cli/utils/fleet.py +5 -5
  9. dstack/_internal/cli/utils/run.py +32 -0
  10. dstack/_internal/core/backends/configurators.py +9 -0
  11. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  12. dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
  13. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  14. dstack/_internal/core/backends/hotaisle/compute.py +225 -0
  15. dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
  16. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  17. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  18. dstack/_internal/core/backends/models.py +8 -0
  19. dstack/_internal/core/backends/nebius/compute.py +8 -2
  20. dstack/_internal/core/backends/nebius/fabrics.py +1 -0
  21. dstack/_internal/core/backends/nebius/resources.py +9 -0
  22. dstack/_internal/core/compatibility/runs.py +8 -0
  23. dstack/_internal/core/models/backends/base.py +2 -0
  24. dstack/_internal/core/models/configurations.py +139 -1
  25. dstack/_internal/core/models/health.py +28 -0
  26. dstack/_internal/core/models/instances.py +2 -0
  27. dstack/_internal/core/models/logs.py +2 -1
  28. dstack/_internal/core/models/runs.py +21 -1
  29. dstack/_internal/core/services/ssh/tunnel.py +7 -0
  30. dstack/_internal/server/app.py +4 -0
  31. dstack/_internal/server/background/__init__.py +4 -0
  32. dstack/_internal/server/background/tasks/process_instances.py +107 -56
  33. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  34. dstack/_internal/server/background/tasks/process_running_jobs.py +13 -0
  35. dstack/_internal/server/background/tasks/process_runs.py +21 -14
  36. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  37. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  38. dstack/_internal/server/models.py +41 -0
  39. dstack/_internal/server/routers/instances.py +33 -5
  40. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  41. dstack/_internal/server/schemas/instances.py +32 -0
  42. dstack/_internal/server/schemas/runner.py +5 -0
  43. dstack/_internal/server/services/instances.py +103 -1
  44. dstack/_internal/server/services/jobs/__init__.py +8 -1
  45. dstack/_internal/server/services/jobs/configurators/base.py +26 -0
  46. dstack/_internal/server/services/logging.py +4 -2
  47. dstack/_internal/server/services/logs/aws.py +13 -1
  48. dstack/_internal/server/services/logs/gcp.py +16 -1
  49. dstack/_internal/server/services/probes.py +6 -0
  50. dstack/_internal/server/services/projects.py +16 -4
  51. dstack/_internal/server/services/runner/client.py +52 -20
  52. dstack/_internal/server/services/runner/ssh.py +4 -4
  53. dstack/_internal/server/services/runs.py +49 -13
  54. dstack/_internal/server/services/ssh.py +66 -0
  55. dstack/_internal/server/settings.py +13 -0
  56. dstack/_internal/server/statics/index.html +1 -1
  57. dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
  58. dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
  59. dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
  60. dstack/_internal/server/testing/common.py +44 -0
  61. dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
  62. dstack/_internal/settings.py +3 -0
  63. dstack/_internal/utils/common.py +15 -0
  64. dstack/api/server/__init__.py +1 -1
  65. dstack/version.py +1 -1
  66. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/METADATA +14 -14
  67. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/RECORD +71 -58
  68. /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
  69. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/WHEEL +0 -0
  70. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/entry_points.txt +0 -0
  71. {dstack-0.19.21.dist-info → dstack-0.19.23rc1.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,7 +1,6 @@
1
1
  import uuid
2
- from dataclasses import dataclass
3
2
  from http import HTTPStatus
4
- from typing import BinaryIO, Dict, List, Optional, TypeVar, Union
3
+ from typing import BinaryIO, Dict, List, Literal, Optional, TypeVar, Union, overload
5
4
 
6
5
  import packaging.version
7
6
  import requests
@@ -14,9 +13,11 @@ from dstack._internal.core.models.repos.remote import RemoteRepoCreds
14
13
  from dstack._internal.core.models.resources import Memory
15
14
  from dstack._internal.core.models.runs import ClusterInfo, Job, Run
16
15
  from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
16
+ from dstack._internal.server.schemas.instances import InstanceCheck
17
17
  from dstack._internal.server.schemas.runner import (
18
18
  GPUDevice,
19
19
  HealthcheckResponse,
20
+ InstanceHealthResponse,
20
21
  LegacyPullResponse,
21
22
  LegacyStopBody,
22
23
  LegacySubmitBody,
@@ -37,15 +38,6 @@ UPLOAD_CODE_REQUEST_TIMEOUT = 60
37
38
  logger = get_logger(__name__)
38
39
 
39
40
 
40
- @dataclass
41
- class HealthStatus:
42
- healthy: bool
43
- reason: str
44
-
45
- def __str__(self) -> str:
46
- return self.reason
47
-
48
-
49
41
  class RunnerClient:
50
42
  def __init__(
51
43
  self,
@@ -193,6 +185,9 @@ class ShimClient:
193
185
  # API v1 (a.k.a. Legacy API) — `/api/{submit,pull,stop}`
194
186
  _API_V2_MIN_SHIM_VERSION = (0, 18, 34)
195
187
 
188
+ # `/api/instance/health`
189
+ _INSTANCE_HEALTH_MIN_SHIM_VERSION = (0, 19, 22)
190
+
196
191
  _shim_version: Optional["_Version"]
197
192
  _api_version: int
198
193
  _negotiated: bool = False
@@ -212,11 +207,25 @@ class ShimClient:
212
207
  self._negotiate()
213
208
  return self._api_version == 2
214
209
 
215
- def healthcheck(self, unmask_exeptions: bool = False) -> Optional[HealthcheckResponse]:
210
+ def is_instance_health_supported(self) -> bool:
211
+ if not self._negotiated:
212
+ self._negotiate()
213
+ return (
214
+ self._shim_version is None
215
+ or self._shim_version >= self._INSTANCE_HEALTH_MIN_SHIM_VERSION
216
+ )
217
+
218
+ @overload
219
+ def healthcheck(self) -> Optional[HealthcheckResponse]: ...
220
+
221
+ @overload
222
+ def healthcheck(self, unmask_exceptions: Literal[True]) -> HealthcheckResponse: ...
223
+
224
+ def healthcheck(self, unmask_exceptions: bool = False) -> Optional[HealthcheckResponse]:
216
225
  try:
217
226
  resp = self._request("GET", "/api/healthcheck", raise_for_status=True)
218
227
  except requests.exceptions.RequestException:
219
- if unmask_exeptions:
228
+ if unmask_exceptions:
220
229
  raise
221
230
  return None
222
231
  if not self._negotiated:
@@ -225,6 +234,17 @@ class ShimClient:
225
234
 
226
235
  # API v2 methods
227
236
 
237
+ def get_instance_health(self) -> Optional[InstanceHealthResponse]:
238
+ if not self.is_instance_health_supported():
239
+ logger.debug("instance health is not supported: %s", self._shim_version)
240
+ return None
241
+ resp = self._request("GET", "/api/instance/health")
242
+ if resp.status_code == HTTPStatus.NOT_FOUND:
243
+ logger.warning("instance health: %s", resp.text)
244
+ return None
245
+ self._raise_for_status(resp)
246
+ return self._response(InstanceHealthResponse, resp)
247
+
228
248
  def get_task(self, task_id: "_TaskID") -> TaskInfoResponse:
229
249
  if not self.is_api_v2_supported():
230
250
  raise ShimAPIVersionError()
@@ -418,14 +438,26 @@ class ShimClient:
418
438
  self._negotiated = True
419
439
 
420
440
 
421
- def health_response_to_health_status(data: HealthcheckResponse) -> HealthStatus:
422
- if data.service == "dstack-shim":
423
- return HealthStatus(healthy=True, reason="Service is OK")
424
- else:
425
- return HealthStatus(
426
- healthy=False,
427
- reason=f"Service name is {data.service}, service version: {data.version}",
441
+ def healthcheck_response_to_instance_check(
442
+ response: HealthcheckResponse,
443
+ instance_health_response: Optional[InstanceHealthResponse] = None,
444
+ ) -> InstanceCheck:
445
+ if response.service == "dstack-shim":
446
+ message: Optional[str] = None
447
+ if (
448
+ instance_health_response is not None
449
+ and instance_health_response.dcgm is not None
450
+ and instance_health_response.dcgm.incidents
451
+ ):
452
+ message = instance_health_response.dcgm.incidents[0].error_message
453
+ return InstanceCheck(
454
+ reachable=True, health_response=instance_health_response, message=message
428
455
  )
456
+ return InstanceCheck(
457
+ reachable=False,
458
+ message=f"unexpected service: {response.service} version: {response.version}",
459
+ health_response=instance_health_response,
460
+ )
429
461
 
430
462
 
431
463
  def _volume_to_shim_volume_info(volume: Volume, instance_id: str) -> ShimVolumeInfo:
@@ -2,7 +2,7 @@ import functools
2
2
  import socket
3
3
  import time
4
4
  from collections.abc import Iterable
5
- from typing import Callable, Dict, List, Optional, TypeVar, Union
5
+ from typing import Callable, Dict, List, Literal, Optional, TypeVar, Union
6
6
 
7
7
  import requests
8
8
  from typing_extensions import Concatenate, ParamSpec
@@ -27,7 +27,7 @@ def runner_ssh_tunnel(
27
27
  [Callable[Concatenate[Dict[int, int], P], R]],
28
28
  Callable[
29
29
  Concatenate[PrivateKeyOrPair, JobProvisioningData, Optional[JobRuntimeData], P],
30
- Union[bool, R],
30
+ Union[Literal[False], R],
31
31
  ],
32
32
  ]:
33
33
  """
@@ -42,7 +42,7 @@ def runner_ssh_tunnel(
42
42
  func: Callable[Concatenate[Dict[int, int], P], R],
43
43
  ) -> Callable[
44
44
  Concatenate[PrivateKeyOrPair, JobProvisioningData, Optional[JobRuntimeData], P],
45
- Union[bool, R],
45
+ Union[Literal[False], R],
46
46
  ]:
47
47
  @functools.wraps(func)
48
48
  def wrapper(
@@ -51,7 +51,7 @@ def runner_ssh_tunnel(
51
51
  job_runtime_data: Optional[JobRuntimeData],
52
52
  *args: P.args,
53
53
  **kwargs: P.kwargs,
54
- ) -> Union[bool, R]:
54
+ ) -> Union[Literal[False], R]:
55
55
  """
56
56
  Returns:
57
57
  is successful
@@ -1,6 +1,7 @@
1
1
  import itertools
2
2
  import math
3
3
  import uuid
4
+ from collections.abc import Iterable
4
5
  from datetime import datetime, timezone
5
6
  from typing import List, Optional
6
7
 
@@ -8,7 +9,7 @@ import pydantic
8
9
  from apscheduler.triggers.cron import CronTrigger
9
10
  from sqlalchemy import and_, func, or_, select, update
10
11
  from sqlalchemy.ext.asyncio import AsyncSession
11
- from sqlalchemy.orm import joinedload
12
+ from sqlalchemy.orm import joinedload, selectinload
12
13
 
13
14
  import dstack._internal.utils.common as common_utils
14
15
  from dstack._internal.core.errors import (
@@ -17,7 +18,11 @@ from dstack._internal.core.errors import (
17
18
  ServerClientError,
18
19
  )
19
20
  from dstack._internal.core.models.common import ApplyAction
20
- from dstack._internal.core.models.configurations import RUN_PRIORITY_DEFAULT, AnyRunConfiguration
21
+ from dstack._internal.core.models.configurations import (
22
+ RUN_PRIORITY_DEFAULT,
23
+ AnyRunConfiguration,
24
+ ServiceConfiguration,
25
+ )
21
26
  from dstack._internal.core.models.instances import (
22
27
  InstanceAvailability,
23
28
  InstanceOfferWithAvailability,
@@ -219,6 +224,7 @@ async def list_projects_run_models(
219
224
  select(RunModel)
220
225
  .where(*filters)
221
226
  .options(joinedload(RunModel.user).load_only(UserModel.name))
227
+ .options(selectinload(RunModel.jobs).joinedload(JobModel.probes))
222
228
  .order_by(*order_by)
223
229
  .limit(limit)
224
230
  )
@@ -260,6 +266,7 @@ async def get_run_by_name(
260
266
  RunModel.deleted == False,
261
267
  )
262
268
  .options(joinedload(RunModel.user))
269
+ .options(selectinload(RunModel.jobs).joinedload(JobModel.probes))
263
270
  )
264
271
  run_model = res.scalar()
265
272
  if run_model is None:
@@ -279,6 +286,7 @@ async def get_run_by_id(
279
286
  RunModel.id == run_id,
280
287
  )
281
288
  .options(joinedload(RunModel.user))
289
+ .options(selectinload(RunModel.jobs).joinedload(JobModel.probes))
282
290
  )
283
291
  run_model = res.scalar()
284
292
  if run_model is None:
@@ -557,8 +565,8 @@ async def submit_run(
557
565
  await session.commit()
558
566
  await session.refresh(run_model)
559
567
 
560
- run = run_model_to_run(run_model, return_in_api=True)
561
- return run
568
+ run = await get_run_by_id(session, project, run_model.id)
569
+ return common_utils.get_or_error(run)
562
570
 
563
571
 
564
572
  def create_job_model_for_new_submission(
@@ -583,6 +591,7 @@ def create_job_model_for_new_submission(
583
591
  termination_reason=None,
584
592
  job_spec_data=job.job_spec.json(),
585
593
  job_provisioning_data=None,
594
+ probes=[],
586
595
  )
587
596
 
588
597
 
@@ -740,7 +749,9 @@ def _get_run_jobs_with_submissions(
740
749
  job_models = list(job_models)[-job_submissions_limit:]
741
750
  for job_model in job_models:
742
751
  if job_submissions_limit != 0:
743
- job_submission = job_model_to_job_submission(job_model)
752
+ job_submission = job_model_to_job_submission(
753
+ job_model, include_probes=return_in_api
754
+ )
744
755
  if return_in_api:
745
756
  # Set default non-None values for 0.18 backward-compatibility
746
757
  # Remove in 0.19
@@ -980,12 +991,22 @@ def _validate_run_spec_and_set_defaults(run_spec: RunSpec):
980
991
  raise ServerClientError(
981
992
  f"Maximum utilization_policy.time_window is {settings.SERVER_METRICS_RUNNING_TTL_SECONDS}s"
982
993
  )
983
- if (
984
- run_spec.merged_profile.schedule
985
- and run_spec.configuration.type == "service"
986
- and run_spec.configuration.replicas.min == 0
987
- ):
988
- raise ServerClientError("Scheduled services with autoscaling to zero are not supported")
994
+ if isinstance(run_spec.configuration, ServiceConfiguration):
995
+ if run_spec.merged_profile.schedule and run_spec.configuration.replicas.min == 0:
996
+ raise ServerClientError(
997
+ "Scheduled services with autoscaling to zero are not supported"
998
+ )
999
+ if len(run_spec.configuration.probes) > settings.MAX_PROBES_PER_JOB:
1000
+ raise ServerClientError(
1001
+ f"Cannot configure more than {settings.MAX_PROBES_PER_JOB} probes"
1002
+ )
1003
+ if any(
1004
+ p.timeout is not None and p.timeout > settings.MAX_PROBE_TIMEOUT
1005
+ for p in run_spec.configuration.probes
1006
+ ):
1007
+ raise ServerClientError(
1008
+ f"Probe timeout cannot be longer than {settings.MAX_PROBE_TIMEOUT}s"
1009
+ )
989
1010
  if run_spec.configuration.priority is None:
990
1011
  run_spec.configuration.priority = RUN_PRIORITY_DEFAULT
991
1012
  set_resources_defaults(run_spec.configuration.resources)
@@ -1011,6 +1032,7 @@ _TYPE_SPECIFIC_CONF_UPDATABLE_FIELDS = {
1011
1032
  # rolling deployment
1012
1033
  # NOTE: keep this list in sync with the "Rolling deployment" section in services.md
1013
1034
  "port",
1035
+ "probes",
1014
1036
  "resources",
1015
1037
  "volumes",
1016
1038
  "docker",
@@ -1161,9 +1183,12 @@ async def scale_run_replicas(session: AsyncSession, run_model: RunModel, replica
1161
1183
  elif {JobStatus.PROVISIONING, JobStatus.PULLING} & statuses:
1162
1184
  # if there are any provisioning or pulling jobs, the replica is active and has the importance of 1
1163
1185
  active_replicas.append((1, is_out_of_date, replica_num, replica_jobs))
1164
- else:
1165
- # all jobs are running, the replica is active and has the importance of 2
1186
+ elif not is_replica_ready(replica_jobs):
1187
+ # all jobs are running, but probes are failing, the replica is active and has the importance of 2
1166
1188
  active_replicas.append((2, is_out_of_date, replica_num, replica_jobs))
1189
+ else:
1190
+ # all jobs are running and ready, the replica is active and has the importance of 3
1191
+ active_replicas.append((3, is_out_of_date, replica_num, replica_jobs))
1167
1192
 
1168
1193
  # sort by is_out_of_date (up-to-date first), importance (desc), and replica_num (asc)
1169
1194
  active_replicas.sort(key=lambda r: (r[1], -r[0], r[2]))
@@ -1246,6 +1271,17 @@ async def retry_run_replica_jobs(
1246
1271
  session.add(new_job_model)
1247
1272
 
1248
1273
 
1274
+ def is_replica_ready(jobs: Iterable[JobModel]) -> bool:
1275
+ if not all(job.status == JobStatus.RUNNING for job in jobs):
1276
+ return False
1277
+ for job in jobs:
1278
+ job_spec: JobSpec = JobSpec.__response__.parse_raw(job.job_spec_data)
1279
+ for probe_spec, probe in zip(job_spec.probes, job.probes):
1280
+ if probe.success_streak < probe_spec.ready_after:
1281
+ return False
1282
+ return True
1283
+
1284
+
1249
1285
  def _remove_job_spec_sensitive_info(spec: JobSpec):
1250
1286
  spec.ssh_key = None
1251
1287
 
@@ -0,0 +1,66 @@
1
+ from collections.abc import Iterable
2
+ from typing import Optional
3
+
4
+ import dstack._internal.server.services.jobs as jobs_services
5
+ from dstack._internal.core.consts import DSTACK_RUNNER_SSH_PORT
6
+ from dstack._internal.core.models.backends.base import BackendType
7
+ from dstack._internal.core.models.instances import RemoteConnectionInfo, SSHConnectionParams
8
+ from dstack._internal.core.models.runs import JobProvisioningData
9
+ from dstack._internal.core.services.ssh.tunnel import SSH_DEFAULT_OPTIONS, SocketPair, SSHTunnel
10
+ from dstack._internal.server.models import JobModel
11
+ from dstack._internal.utils.common import get_or_error
12
+ from dstack._internal.utils.path import FileContent
13
+
14
+
15
+ def container_ssh_tunnel(
16
+ job: JobModel,
17
+ forwarded_sockets: Iterable[SocketPair] = (),
18
+ options: dict[str, str] = SSH_DEFAULT_OPTIONS,
19
+ ) -> SSHTunnel:
20
+ """
21
+ Build SSHTunnel for connecting to the container running the specified job.
22
+ """
23
+
24
+ jpd: JobProvisioningData = JobProvisioningData.__response__.parse_raw(
25
+ job.job_provisioning_data
26
+ )
27
+ if not jpd.dockerized:
28
+ ssh_destination = f"{jpd.username}@{jpd.hostname}"
29
+ ssh_port = jpd.ssh_port
30
+ ssh_proxy = jpd.ssh_proxy
31
+ else:
32
+ ssh_destination = "root@localhost" # TODO(#1535): support non-root images properly
33
+ ssh_port = DSTACK_RUNNER_SSH_PORT
34
+ job_submission = jobs_services.job_model_to_job_submission(job)
35
+ jrd = job_submission.job_runtime_data
36
+ if jrd is not None and jrd.ports is not None:
37
+ ssh_port = jrd.ports.get(ssh_port, ssh_port)
38
+ ssh_proxy = SSHConnectionParams(
39
+ hostname=jpd.hostname,
40
+ username=jpd.username,
41
+ port=jpd.ssh_port,
42
+ )
43
+ if jpd.backend == BackendType.LOCAL:
44
+ ssh_proxy = None
45
+ ssh_head_proxy: Optional[SSHConnectionParams] = None
46
+ ssh_head_proxy_private_key: Optional[str] = None
47
+ instance = get_or_error(job.instance)
48
+ if instance.remote_connection_info is not None:
49
+ rci = RemoteConnectionInfo.__response__.parse_raw(instance.remote_connection_info)
50
+ if rci.ssh_proxy is not None:
51
+ ssh_head_proxy = rci.ssh_proxy
52
+ ssh_head_proxy_private_key = get_or_error(rci.ssh_proxy_keys)[0].private
53
+ ssh_proxies = []
54
+ if ssh_head_proxy is not None:
55
+ ssh_head_proxy_private_key = get_or_error(ssh_head_proxy_private_key)
56
+ ssh_proxies.append((ssh_head_proxy, FileContent(ssh_head_proxy_private_key)))
57
+ if ssh_proxy is not None:
58
+ ssh_proxies.append((ssh_proxy, None))
59
+ return SSHTunnel(
60
+ destination=ssh_destination,
61
+ port=ssh_port,
62
+ ssh_proxies=ssh_proxies,
63
+ identity=FileContent(instance.project.ssh_private_key),
64
+ forwarded_sockets=forwarded_sockets,
65
+ options=options,
66
+ )
@@ -1,3 +1,7 @@
1
+ """
2
+ Environment variables read by the dstack server. Documented in reference/environment-variables.md
3
+ """
4
+
1
5
  import os
2
6
  import warnings
3
7
  from pathlib import Path
@@ -50,6 +54,8 @@ SERVER_BACKGROUND_PROCESSING_ENABLED = not SERVER_BACKGROUND_PROCESSING_DISABLED
50
54
  SERVER_EXECUTOR_MAX_WORKERS = int(os.getenv("DSTACK_SERVER_EXECUTOR_MAX_WORKERS", 128))
51
55
 
52
56
  MAX_OFFERS_TRIED = int(os.getenv("DSTACK_SERVER_MAX_OFFERS_TRIED", 25))
57
+ MAX_PROBES_PER_JOB = int(os.getenv("DSTACK_SERVER_MAX_PROBES_PER_JOB", 10))
58
+ MAX_PROBE_TIMEOUT = int(os.getenv("DSTACK_SERVER_MAX_PROBE_TIMEOUT", 60 * 5))
53
59
 
54
60
  SERVER_CONFIG_DISABLED = os.getenv("DSTACK_SERVER_CONFIG_DISABLED") is not None
55
61
  SERVER_CONFIG_ENABLED = not SERVER_CONFIG_DISABLED
@@ -87,6 +93,13 @@ SERVER_METRICS_FINISHED_TTL_SECONDS = int(
87
93
  os.getenv("DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS", 7 * 24 * 3600)
88
94
  )
89
95
 
96
+ SERVER_INSTANCE_HEALTH_TTL_SECONDS = int(
97
+ os.getenv("DSTACK_SERVER_INSTANCE_HEALTH_TTL_SECONDS", 7 * 24 * 3600)
98
+ )
99
+ SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS = int(
100
+ os.getenv("DSTACK_SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS", 60)
101
+ )
102
+
90
103
  SERVER_KEEP_SHIM_TASKS = os.getenv("DSTACK_SERVER_KEEP_SHIM_TASKS") is not None
91
104
 
92
105
  DEFAULT_PROJECT_NAME = "main"
@@ -1,3 +1,3 @@
1
1
  <!doctype html><html lang="en"><head><meta charset="utf-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><title>dstack</title><meta name="description" content="Get GPUs at the best prices and availability from a wide range of providers. No cloud account of your own is required.
2
2
  "/><link rel="preconnect" href="https://fonts.googleapis.com"><link rel="preconnect" href="https://fonts.gstatic.com" crossorigin><link href="https://fonts.googleapis.com/css2?family=Roboto:ital,wght@0,100;0,300;0,400;0,500;0,700;0,900;1,100;1,300;1,400;1,500;1,700;1,900&display=swap" rel="stylesheet"><meta name="og:title" content="dstack"><meta name="og:type" content="article"><meta name="og:image" content="/splash_thumbnail.png"><meta name="og:description" content="Get GPUs at the best prices and availability from a wide range of providers. No cloud account of your own is required.
3
- "><link rel="icon" type="image/x-icon" href="/assets/favicon.ico"><link rel="icon" type="image/png" sizes="16x16" href="/assets/favicon-16x16.png"><link rel="icon" type="image/png" sizes="32x32" href="/assets/favicon-32x32.png"><link rel="icon" type="image/png" sizes="48x48" href="/assets/favicon-48x48.png"><link rel="manifest" href="/assets/manifest.webmanifest"><meta name="mobile-web-app-capable" content="yes"><meta name="theme-color" content="#fff"><meta name="application-name" content="dstackai"><link rel="apple-touch-icon" sizes="57x57" href="/assets/apple-touch-icon-57x57.png"><link rel="apple-touch-icon" sizes="60x60" href="/assets/apple-touch-icon-60x60.png"><link rel="apple-touch-icon" sizes="72x72" href="/assets/apple-touch-icon-72x72.png"><link rel="apple-touch-icon" sizes="76x76" href="/assets/apple-touch-icon-76x76.png"><link rel="apple-touch-icon" sizes="114x114" href="/assets/apple-touch-icon-114x114.png"><link rel="apple-touch-icon" sizes="120x120" href="/assets/apple-touch-icon-120x120.png"><link rel="apple-touch-icon" sizes="144x144" href="/assets/apple-touch-icon-144x144.png"><link rel="apple-touch-icon" sizes="152x152" href="/assets/apple-touch-icon-152x152.png"><link rel="apple-touch-icon" sizes="167x167" href="/assets/apple-touch-icon-167x167.png"><link rel="apple-touch-icon" sizes="180x180" href="/assets/apple-touch-icon-180x180.png"><link rel="apple-touch-icon" sizes="1024x1024" href="/assets/apple-touch-icon-1024x1024.png"><meta name="apple-mobile-web-app-capable" content="yes"><meta name="apple-mobile-web-app-status-bar-style" content="black-translucent"><meta name="apple-mobile-web-app-title" content="dstackai"><link rel="apple-touch-startup-image" media="(device-width: 320px) and (device-height: 568px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-640x1136.png"><link rel="apple-touch-startup-image" media="(device-width: 320px) and (device-height: 568px) and 
(-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-1136x640.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 667px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-750x1334.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 667px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-1334x750.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 812px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1125x2436.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 812px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2436x1125.png"><link rel="apple-touch-startup-image" media="(device-width: 390px) and (device-height: 844px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1170x2532.png"><link rel="apple-touch-startup-image" media="(device-width: 390px) and (device-height: 844px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2532x1170.png"><link rel="apple-touch-startup-image" media="(device-width: 393px) and (device-height: 852px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1179x2556.png"><link rel="apple-touch-startup-image" media="(device-width: 393px) and (device-height: 852px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2556x1179.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" 
href="/assets/apple-touch-startup-image-828x1792.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-1792x828.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1242x2688.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2688x1242.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 736px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1242x2208.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 736px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2208x1242.png"><link rel="apple-touch-startup-image" media="(device-width: 428px) and (device-height: 926px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1284x2778.png"><link rel="apple-touch-startup-image" media="(device-width: 428px) and (device-height: 926px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2778x1284.png"><link rel="apple-touch-startup-image" media="(device-width: 430px) and (device-height: 932px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1290x2796.png"><link rel="apple-touch-startup-image" media="(device-width: 430px) and (device-height: 932px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2796x1290.png"><link 
rel="apple-touch-startup-image" media="(device-width: 744px) and (device-height: 1133px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1488x2266.png"><link rel="apple-touch-startup-image" media="(device-width: 744px) and (device-height: 1133px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2266x1488.png"><link rel="apple-touch-startup-image" media="(device-width: 768px) and (device-height: 1024px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1536x2048.png"><link rel="apple-touch-startup-image" media="(device-width: 768px) and (device-height: 1024px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2048x1536.png"><link rel="apple-touch-startup-image" media="(device-width: 810px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1620x2160.png"><link rel="apple-touch-startup-image" media="(device-width: 810px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2160x1620.png"><link rel="apple-touch-startup-image" media="(device-width: 820px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1640x2160.png"><link rel="apple-touch-startup-image" media="(device-width: 820px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2160x1640.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) and (device-height: 1194px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1668x2388.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) 
and (device-height: 1194px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2388x1668.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) and (device-height: 1112px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1668x2224.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) and (device-height: 1112px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2224x1668.png"><link rel="apple-touch-startup-image" media="(device-width: 1024px) and (device-height: 1366px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-2048x2732.png"><link rel="apple-touch-startup-image" media="(device-width: 1024px) and (device-height: 1366px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2732x2048.png"><meta name="msapplication-TileColor" content="#fff"><meta name="msapplication-TileImage" content="/assets/mstile-144x144.png"><meta name="msapplication-config" content="/assets/browserconfig.xml"><link rel="yandex-tableau-widget" href="/assets/yandex-browser-manifest.json"><script defer="defer" src="/main-39a767528976f8078166.js"></script><link href="/main-8f9ee218d3eb45989682.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div class="b-page-header" id="header"></div><div id="root"></div></body></html>
3
+ "><link rel="icon" type="image/x-icon" href="/assets/favicon.ico"><link rel="icon" type="image/png" sizes="16x16" href="/assets/favicon-16x16.png"><link rel="icon" type="image/png" sizes="32x32" href="/assets/favicon-32x32.png"><link rel="icon" type="image/png" sizes="48x48" href="/assets/favicon-48x48.png"><link rel="manifest" href="/assets/manifest.webmanifest"><meta name="mobile-web-app-capable" content="yes"><meta name="theme-color" content="#fff"><meta name="application-name" content="dstackai"><link rel="apple-touch-icon" sizes="57x57" href="/assets/apple-touch-icon-57x57.png"><link rel="apple-touch-icon" sizes="60x60" href="/assets/apple-touch-icon-60x60.png"><link rel="apple-touch-icon" sizes="72x72" href="/assets/apple-touch-icon-72x72.png"><link rel="apple-touch-icon" sizes="76x76" href="/assets/apple-touch-icon-76x76.png"><link rel="apple-touch-icon" sizes="114x114" href="/assets/apple-touch-icon-114x114.png"><link rel="apple-touch-icon" sizes="120x120" href="/assets/apple-touch-icon-120x120.png"><link rel="apple-touch-icon" sizes="144x144" href="/assets/apple-touch-icon-144x144.png"><link rel="apple-touch-icon" sizes="152x152" href="/assets/apple-touch-icon-152x152.png"><link rel="apple-touch-icon" sizes="167x167" href="/assets/apple-touch-icon-167x167.png"><link rel="apple-touch-icon" sizes="180x180" href="/assets/apple-touch-icon-180x180.png"><link rel="apple-touch-icon" sizes="1024x1024" href="/assets/apple-touch-icon-1024x1024.png"><meta name="apple-mobile-web-app-capable" content="yes"><meta name="apple-mobile-web-app-status-bar-style" content="black-translucent"><meta name="apple-mobile-web-app-title" content="dstackai"><link rel="apple-touch-startup-image" media="(device-width: 320px) and (device-height: 568px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-640x1136.png"><link rel="apple-touch-startup-image" media="(device-width: 320px) and (device-height: 568px) and 
(-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-1136x640.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 667px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-750x1334.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 667px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-1334x750.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 812px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1125x2436.png"><link rel="apple-touch-startup-image" media="(device-width: 375px) and (device-height: 812px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2436x1125.png"><link rel="apple-touch-startup-image" media="(device-width: 390px) and (device-height: 844px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1170x2532.png"><link rel="apple-touch-startup-image" media="(device-width: 390px) and (device-height: 844px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2532x1170.png"><link rel="apple-touch-startup-image" media="(device-width: 393px) and (device-height: 852px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1179x2556.png"><link rel="apple-touch-startup-image" media="(device-width: 393px) and (device-height: 852px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2556x1179.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" 
href="/assets/apple-touch-startup-image-828x1792.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-1792x828.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1242x2688.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 896px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2688x1242.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 736px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1242x2208.png"><link rel="apple-touch-startup-image" media="(device-width: 414px) and (device-height: 736px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2208x1242.png"><link rel="apple-touch-startup-image" media="(device-width: 428px) and (device-height: 926px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1284x2778.png"><link rel="apple-touch-startup-image" media="(device-width: 428px) and (device-height: 926px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2778x1284.png"><link rel="apple-touch-startup-image" media="(device-width: 430px) and (device-height: 932px) and (-webkit-device-pixel-ratio: 3) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1290x2796.png"><link rel="apple-touch-startup-image" media="(device-width: 430px) and (device-height: 932px) and (-webkit-device-pixel-ratio: 3) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2796x1290.png"><link 
rel="apple-touch-startup-image" media="(device-width: 744px) and (device-height: 1133px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1488x2266.png"><link rel="apple-touch-startup-image" media="(device-width: 744px) and (device-height: 1133px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2266x1488.png"><link rel="apple-touch-startup-image" media="(device-width: 768px) and (device-height: 1024px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1536x2048.png"><link rel="apple-touch-startup-image" media="(device-width: 768px) and (device-height: 1024px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2048x1536.png"><link rel="apple-touch-startup-image" media="(device-width: 810px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1620x2160.png"><link rel="apple-touch-startup-image" media="(device-width: 810px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2160x1620.png"><link rel="apple-touch-startup-image" media="(device-width: 820px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1640x2160.png"><link rel="apple-touch-startup-image" media="(device-width: 820px) and (device-height: 1080px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2160x1640.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) and (device-height: 1194px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1668x2388.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) 
and (device-height: 1194px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2388x1668.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) and (device-height: 1112px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-1668x2224.png"><link rel="apple-touch-startup-image" media="(device-width: 834px) and (device-height: 1112px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2224x1668.png"><link rel="apple-touch-startup-image" media="(device-width: 1024px) and (device-height: 1366px) and (-webkit-device-pixel-ratio: 2) and (orientation: portrait)" href="/assets/apple-touch-startup-image-2048x2732.png"><link rel="apple-touch-startup-image" media="(device-width: 1024px) and (device-height: 1366px) and (-webkit-device-pixel-ratio: 2) and (orientation: landscape)" href="/assets/apple-touch-startup-image-2732x2048.png"><meta name="msapplication-TileColor" content="#fff"><meta name="msapplication-TileImage" content="/assets/mstile-144x144.png"><meta name="msapplication-config" content="/assets/browserconfig.xml"><link rel="yandex-tableau-widget" href="/assets/yandex-browser-manifest.json"><script defer="defer" src="/main-cc067b7fd1a8f33f97da.js"></script><link href="/main-03e818b110e1d5705378.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div class="b-page-header" id="header"></div><div id="root"></div></body></html>