dstack 0.19.20__py3-none-any.whl → 0.19.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dstack might be problematic. Click here for more details.

Files changed (93) hide show
  1. dstack/_internal/cli/commands/apply.py +8 -3
  2. dstack/_internal/cli/services/configurators/__init__.py +8 -0
  3. dstack/_internal/cli/services/configurators/fleet.py +1 -1
  4. dstack/_internal/cli/services/configurators/gateway.py +1 -1
  5. dstack/_internal/cli/services/configurators/run.py +11 -1
  6. dstack/_internal/cli/services/configurators/volume.py +1 -1
  7. dstack/_internal/cli/utils/common.py +48 -5
  8. dstack/_internal/cli/utils/fleet.py +5 -5
  9. dstack/_internal/cli/utils/run.py +32 -0
  10. dstack/_internal/core/backends/__init__.py +0 -65
  11. dstack/_internal/core/backends/configurators.py +9 -0
  12. dstack/_internal/core/backends/features.py +64 -0
  13. dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
  14. dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
  15. dstack/_internal/core/backends/hotaisle/backend.py +16 -0
  16. dstack/_internal/core/backends/hotaisle/compute.py +225 -0
  17. dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
  18. dstack/_internal/core/backends/hotaisle/models.py +45 -0
  19. dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
  20. dstack/_internal/core/backends/models.py +8 -0
  21. dstack/_internal/core/compatibility/fleets.py +2 -0
  22. dstack/_internal/core/compatibility/runs.py +12 -0
  23. dstack/_internal/core/models/backends/base.py +2 -0
  24. dstack/_internal/core/models/configurations.py +139 -1
  25. dstack/_internal/core/models/health.py +28 -0
  26. dstack/_internal/core/models/instances.py +2 -0
  27. dstack/_internal/core/models/logs.py +2 -1
  28. dstack/_internal/core/models/profiles.py +37 -0
  29. dstack/_internal/core/models/runs.py +21 -1
  30. dstack/_internal/core/services/ssh/tunnel.py +7 -0
  31. dstack/_internal/server/app.py +26 -10
  32. dstack/_internal/server/background/__init__.py +9 -6
  33. dstack/_internal/server/background/tasks/process_fleets.py +52 -38
  34. dstack/_internal/server/background/tasks/process_gateways.py +2 -2
  35. dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
  36. dstack/_internal/server/background/tasks/process_instances.py +168 -103
  37. dstack/_internal/server/background/tasks/process_metrics.py +9 -2
  38. dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
  39. dstack/_internal/server/background/tasks/process_probes.py +164 -0
  40. dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
  41. dstack/_internal/server/background/tasks/process_running_jobs.py +142 -124
  42. dstack/_internal/server/background/tasks/process_runs.py +84 -34
  43. dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
  44. dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
  45. dstack/_internal/server/background/tasks/process_volumes.py +4 -1
  46. dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
  47. dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
  48. dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
  49. dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
  50. dstack/_internal/server/models.py +57 -16
  51. dstack/_internal/server/routers/instances.py +33 -5
  52. dstack/_internal/server/schemas/health/dcgm.py +56 -0
  53. dstack/_internal/server/schemas/instances.py +32 -0
  54. dstack/_internal/server/schemas/runner.py +5 -0
  55. dstack/_internal/server/services/fleets.py +19 -10
  56. dstack/_internal/server/services/gateways/__init__.py +17 -17
  57. dstack/_internal/server/services/instances.py +113 -15
  58. dstack/_internal/server/services/jobs/__init__.py +18 -13
  59. dstack/_internal/server/services/jobs/configurators/base.py +26 -0
  60. dstack/_internal/server/services/logging.py +4 -2
  61. dstack/_internal/server/services/logs/aws.py +13 -1
  62. dstack/_internal/server/services/logs/gcp.py +16 -1
  63. dstack/_internal/server/services/offers.py +3 -3
  64. dstack/_internal/server/services/probes.py +6 -0
  65. dstack/_internal/server/services/projects.py +51 -19
  66. dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
  67. dstack/_internal/server/services/prometheus/custom_metrics.py +2 -3
  68. dstack/_internal/server/services/runner/client.py +52 -20
  69. dstack/_internal/server/services/runner/ssh.py +4 -4
  70. dstack/_internal/server/services/runs.py +115 -39
  71. dstack/_internal/server/services/services/__init__.py +4 -1
  72. dstack/_internal/server/services/ssh.py +66 -0
  73. dstack/_internal/server/services/users.py +2 -3
  74. dstack/_internal/server/services/volumes.py +11 -11
  75. dstack/_internal/server/settings.py +16 -0
  76. dstack/_internal/server/statics/index.html +1 -1
  77. dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
  78. dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
  79. dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
  80. dstack/_internal/server/testing/common.py +51 -0
  81. dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
  82. dstack/_internal/server/utils/sentry_utils.py +12 -0
  83. dstack/_internal/settings.py +3 -0
  84. dstack/_internal/utils/common.py +15 -0
  85. dstack/_internal/utils/cron.py +5 -0
  86. dstack/api/server/__init__.py +1 -1
  87. dstack/version.py +1 -1
  88. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/METADATA +13 -22
  89. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/RECORD +93 -75
  90. /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
  91. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/WHEEL +0 -0
  92. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/entry_points.txt +0 -0
  93. {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,225 @@
1
+ import shlex
2
+ import subprocess
3
+ import tempfile
4
+ from threading import Thread
5
+ from typing import List, Optional
6
+
7
+ import gpuhunt
8
+ from gpuhunt.providers.hotaisle import HotAisleProvider
9
+
10
+ from dstack._internal.core.backends.base.compute import (
11
+ Compute,
12
+ ComputeWithCreateInstanceSupport,
13
+ get_shim_commands,
14
+ )
15
+ from dstack._internal.core.backends.base.offers import get_catalog_offers
16
+ from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient
17
+ from dstack._internal.core.backends.hotaisle.models import HotAisleConfig
18
+ from dstack._internal.core.models.backends.base import BackendType
19
+ from dstack._internal.core.models.common import CoreModel
20
+ from dstack._internal.core.models.instances import (
21
+ InstanceAvailability,
22
+ InstanceConfiguration,
23
+ InstanceOfferWithAvailability,
24
+ )
25
+ from dstack._internal.core.models.placement import PlacementGroup
26
+ from dstack._internal.core.models.runs import JobProvisioningData, Requirements
27
+ from dstack._internal.utils.logging import get_logger
28
+
29
logger = get_logger(__name__)

# NOTE(review): this constant is not referenced anywhere in this module —
# confirm it is used by callers before removing.
MAX_INSTANCE_NAME_LEN = 60


# CPU details per supported Hot Aisle instance type. The gpuhunt offer does not
# carry CPU model/frequency, so they are hard-coded here; offers whose instance
# name is not in this table are skipped in get_offers().
INSTANCE_TYPE_SPECS = {
    "1x MI300X 8x Xeon Platinum 8462Y+": {
        "cpu_model": "Xeon Platinum 8462Y+",
        "cpu_frequency": 2800000000,  # presumably Hz (2.8 GHz) — confirm API units
        "cpu_manufacturer": "Intel",
    },
    "1x MI300X 13x Xeon Platinum 8470": {
        "cpu_model": "Xeon Platinum 8470",
        "cpu_frequency": 2000000000,  # presumably Hz (2.0 GHz) — confirm API units
        "cpu_manufacturer": "Intel",
    },
}
46
+
47
+
48
class HotAisleCompute(
    ComputeWithCreateInstanceSupport,
    Compute,
):
    """Compute implementation for the Hot Aisle backend.

    Creates bare VMs through the Hot Aisle API and then bootstraps dstack-shim
    on them over SSH (no user-data/cloud-init mechanism is visible here).
    """

    def __init__(self, config: HotAisleConfig):
        super().__init__()
        self.config = config
        self.api_client = HotAisleAPIClient(config.creds.api_key, config.team_handle)
        # Offers come from a dedicated gpuhunt catalog backed by the Hot Aisle
        # provider; automatic reloading and resource balancing are disabled.
        self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False)
        self.catalog.add_provider(
            HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle)
        )

    def get_offers(
        self, requirements: Optional[Requirements] = None
    ) -> List[InstanceOfferWithAvailability]:
        """Return catalog offers restricted to instance types in INSTANCE_TYPE_SPECS."""
        offers = get_catalog_offers(
            backend=BackendType.HOTAISLE,
            locations=self.config.regions or None,
            requirements=requirements,
            catalog=self.catalog,
        )

        supported_offers = []
        for offer in offers:
            if offer.instance.name in INSTANCE_TYPE_SPECS:
                # Every supported offer is reported AVAILABLE; actual capacity
                # is only discovered at create time.
                supported_offers.append(
                    InstanceOfferWithAvailability(
                        **offer.dict(), availability=InstanceAvailability.AVAILABLE
                    )
                )
            else:
                logger.warning(
                    f"Skipping unsupported Hot Aisle instance type: {offer.instance.name}"
                )

        return supported_offers

    def get_payload_from_offer(self, instance_type) -> dict:
        """Build the VM-creation request body from an offer's instance type.

        NOTE(review): assumes the instance type has at least one GPU
        (resources.gpus[0]) — true for all entries in INSTANCE_TYPE_SPECS.
        """
        instance_type_name = instance_type.name
        cpu_specs = INSTANCE_TYPE_SPECS[instance_type_name]
        cpu_cores = instance_type.resources.cpus

        return {
            "cpu_cores": cpu_cores,
            "cpus": {
                "count": 1,
                "manufacturer": cpu_specs["cpu_manufacturer"],
                "model": cpu_specs["cpu_model"],
                "cores": cpu_cores,
                "frequency": cpu_specs["cpu_frequency"],
            },
            # MiB -> bytes
            "disk_capacity": instance_type.resources.disk.size_mib * 1024**2,
            "ram_capacity": instance_type.resources.memory_mib * 1024**2,
            "gpus": [
                {
                    "count": len(instance_type.resources.gpus),
                    "manufacturer": instance_type.resources.gpus[0].vendor,
                    "model": instance_type.resources.gpus[0].name,
                }
            ],
        }

    def create_instance(
        self,
        instance_offer: InstanceOfferWithAvailability,
        instance_config: InstanceConfiguration,
        placement_group: Optional[PlacementGroup],
    ) -> JobProvisioningData:
        """Create a Hot Aisle VM and return its provisioning data.

        hostname is left None here and filled in by update_provisioning_data
        once the VM reports the `running` state.
        """
        project_ssh_key = instance_config.ssh_keys[0]
        self.api_client.upload_ssh_key(project_ssh_key.public)
        vm_payload = self.get_payload_from_offer(instance_offer.instance)
        vm_data = self.api_client.create_virtual_machine(vm_payload)
        return JobProvisioningData(
            backend=instance_offer.backend,
            instance_type=instance_offer.instance,
            instance_id=vm_data["name"],
            hostname=None,
            internal_ip=None,
            region=instance_offer.region,
            price=instance_offer.price,
            username="hotaisle",
            ssh_port=22,
            dockerized=True,
            ssh_proxy=None,
            backend_data=HotAisleInstanceBackendData(
                ip_address=vm_data["ip_address"], vm_id=vm_data["name"]
            ).json(),
        )

    def update_provisioning_data(
        self,
        provisioning_data: JobProvisioningData,
        project_ssh_public_key: str,
        project_ssh_private_key: str,
    ):
        """Poll the VM state; once running, set the hostname and launch dstack-shim.

        The shim launch runs in a daemon thread so this (periodically invoked)
        method does not block on SSH.
        NOTE(review): the thread is started on every call while the VM is
        `running` — confirm callers stop polling once provisioning completes,
        otherwise the shim launch command is re-sent repeatedly.
        """
        vm_state = self.api_client.get_vm_state(provisioning_data.instance_id)
        if vm_state == "running":
            if provisioning_data.hostname is None and provisioning_data.backend_data:
                backend_data = HotAisleInstanceBackendData.load(provisioning_data.backend_data)
                provisioning_data.hostname = backend_data.ip_address
            commands = get_shim_commands(
                authorized_keys=[project_ssh_public_key],
                arch=provisioning_data.instance_type.resources.cpu_arch,
            )
            # Join the shim bootstrap commands into a single quoted shell
            # command so it can be executed via `sudo sh -c`.
            launch_command = "sudo sh -c " + shlex.quote(" && ".join(commands))
            thread = Thread(
                target=_start_runner,
                kwargs={
                    "hostname": provisioning_data.hostname,
                    "project_ssh_private_key": project_ssh_private_key,
                    "launch_command": launch_command,
                },
                daemon=True,
            )
            thread.start()

    def terminate_instance(
        self, instance_id: str, region: str, backend_data: Optional[str] = None
    ):
        """Terminate the VM; instance_id is the Hot Aisle VM name."""
        vm_name = instance_id
        self.api_client.terminate_virtual_machine(vm_name)
170
+
171
+
172
def _start_runner(
    hostname: str,
    project_ssh_private_key: str,
    launch_command: str,
):
    """Thread target that bootstraps dstack-shim on the VM over SSH."""
    _launch_runner(hostname, project_ssh_private_key, launch_command)
182
+
183
+
184
def _launch_runner(
    hostname: str,
    ssh_private_key: str,
    launch_command: str,
):
    """Run the shim launch command on the VM, detached from the SSH session."""
    # Redirect output to a log file and `disown` so the shim survives the
    # SSH session; any trailing `&` on the incoming command is stripped first.
    detached_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown"
    _run_ssh_command(
        hostname=hostname,
        ssh_private_key=ssh_private_key,
        command=detached_command,
    )
195
+
196
+
197
def _run_ssh_command(hostname: str, ssh_private_key: str, command: str):
    """Run `command` on the Hot Aisle VM over SSH using a throwaway key file.

    Output is discarded; the return code is not checked (best-effort launch).
    """
    # Bug fix: NamedTemporaryFile's second positional argument is `buffering`,
    # not a permission mode — the original passed 0o600 (=384) there by
    # mistake. No explicit mode is needed: on POSIX the temp file is already
    # created with 0600 permissions, which is what ssh -i requires.
    with tempfile.NamedTemporaryFile("w+") as f:
        f.write(ssh_private_key)
        f.flush()
        subprocess.run(
            [
                "ssh",
                "-F",
                "none",
                "-o",
                "StrictHostKeyChecking=no",
                "-i",
                f.name,
                f"hotaisle@{hostname}",
                command,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
216
+
217
+
218
class HotAisleInstanceBackendData(CoreModel):
    # Backend-specific data persisted in JobProvisioningData.backend_data.
    ip_address: str
    vm_id: Optional[str] = None

    @classmethod
    def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData":
        """Deserialize from the JSON stored in backend_data.

        Raises:
            ValueError: if `raw` is None. (Was an `assert`, which is stripped
            under `python -O`; an explicit exception is always enforced.)
        """
        if raw is None:
            raise ValueError("backend_data is None; expected serialized instance data")
        return cls.__response__.parse_raw(raw)
@@ -0,0 +1,60 @@
1
+ import json
2
+
3
+ from dstack._internal.core.backends.base.configurator import (
4
+ BackendRecord,
5
+ Configurator,
6
+ )
7
+ from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient
8
+ from dstack._internal.core.backends.hotaisle.backend import HotAisleBackend
9
+ from dstack._internal.core.backends.hotaisle.models import (
10
+ AnyHotAisleBackendConfig,
11
+ AnyHotAisleCreds,
12
+ HotAisleBackendConfig,
13
+ HotAisleBackendConfigWithCreds,
14
+ HotAisleConfig,
15
+ HotAisleCreds,
16
+ HotAisleStoredConfig,
17
+ )
18
+ from dstack._internal.core.models.backends.base import (
19
+ BackendType,
20
+ )
21
+
22
+
23
class HotAisleConfigurator(Configurator):
    """Configurator for the Hot Aisle backend.

    Validates credentials and converts between API config models and the
    stored BackendRecord (config and creds are persisted separately).
    """

    TYPE = BackendType.HOTAISLE
    BACKEND_CLASS = HotAisleBackend

    def validate_config(self, config: HotAisleBackendConfigWithCreds, default_creds_enabled: bool):
        # Hot Aisle has no default-credentials flow; default_creds_enabled is ignored.
        self._validate_creds(config.creds, config.team_handle)

    def create_backend(
        self, project_name: str, config: HotAisleBackendConfigWithCreds
    ) -> BackendRecord:
        """Split the incoming config into a creds-free config blob and an auth blob."""
        return BackendRecord(
            config=HotAisleStoredConfig(
                **HotAisleBackendConfig.__response__.parse_obj(config).dict()
            ).json(),
            auth=HotAisleCreds.parse_obj(config.creds).json(),
        )

    def get_backend_config(
        self, record: BackendRecord, include_creds: bool
    ) -> AnyHotAisleBackendConfig:
        """Return the stored config, with credentials only when requested."""
        config = self._get_config(record)
        if include_creds:
            return HotAisleBackendConfigWithCreds.__response__.parse_obj(config)
        return HotAisleBackendConfig.__response__.parse_obj(config)

    def get_backend(self, record: BackendRecord) -> HotAisleBackend:
        config = self._get_config(record)
        return HotAisleBackend(config=config)

    def _get_config(self, record: BackendRecord) -> HotAisleConfig:
        # Recombine the stored JSON config with the separately stored creds.
        return HotAisleConfig.__response__(
            **json.loads(record.config),
            creds=HotAisleCreds.parse_raw(record.auth),
        )

    def _validate_creds(self, creds: AnyHotAisleCreds, team_handle: str):
        # Live API call; presumably raises on an invalid key — see
        # HotAisleAPIClient.validate_api_key.
        api_client = HotAisleAPIClient(creds.api_key, team_handle)
        api_client.validate_api_key()
@@ -0,0 +1,45 @@
1
+ from typing import Annotated, List, Literal, Optional, Union
2
+
3
+ from pydantic import Field
4
+
5
+ from dstack._internal.core.models.common import CoreModel
6
+
7
+
8
class HotAisleAPIKeyCreds(CoreModel):
    # API-key credentials — currently the only supported auth scheme.
    type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key"
    api_key: Annotated[str, Field(description="The Hot Aisle API key")]


# Single-variant aliases, kept for symmetry with backends that support
# several credential types.
AnyHotAisleCreds = HotAisleAPIKeyCreds
HotAisleCreds = AnyHotAisleCreds


class HotAisleBackendConfig(CoreModel):
    # Backend configuration without credentials.
    type: Annotated[
        Literal["hotaisle"],
        Field(description="The type of backend"),
    ] = "hotaisle"
    team_handle: Annotated[str, Field(description="The Hot Aisle team handle")]
    regions: Annotated[
        Optional[List[str]],
        Field(description="The list of Hot Aisle regions. Omit to use all regions"),
    ] = None


class HotAisleBackendConfigWithCreds(HotAisleBackendConfig):
    # Configuration plus credentials, as accepted/returned by the API.
    creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")]


AnyHotAisleBackendConfig = Union[HotAisleBackendConfig, HotAisleBackendConfigWithCreds]


class HotAisleBackendFileConfigWithCreds(HotAisleBackendConfig):
    # Variant used when the config comes from the server config file.
    creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")]


class HotAisleStoredConfig(HotAisleBackendConfig):
    # Shape persisted in the backend record (creds are stored separately).
    pass


class HotAisleConfig(HotAisleStoredConfig):
    # Full runtime config: stored config recombined with creds.
    creds: AnyHotAisleCreds
@@ -206,10 +206,11 @@ def _launch_runner(
206
206
  ssh_private_key: str,
207
207
  launch_command: str,
208
208
  ):
209
+ daemonized_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown"
209
210
  _run_ssh_command(
210
211
  hostname=hostname,
211
212
  ssh_private_key=ssh_private_key,
212
- command=launch_command,
213
+ command=daemonized_command,
213
214
  )
214
215
 
215
216
 
@@ -29,6 +29,11 @@ from dstack._internal.core.backends.gcp.models import (
29
29
  GCPBackendConfigWithCreds,
30
30
  GCPBackendFileConfigWithCreds,
31
31
  )
32
+ from dstack._internal.core.backends.hotaisle.models import (
33
+ HotAisleBackendConfig,
34
+ HotAisleBackendConfigWithCreds,
35
+ HotAisleBackendFileConfigWithCreds,
36
+ )
32
37
  from dstack._internal.core.backends.kubernetes.models import (
33
38
  KubernetesBackendConfig,
34
39
  KubernetesBackendConfigWithCreds,
@@ -73,6 +78,7 @@ AnyBackendConfigWithoutCreds = Union[
73
78
  CudoBackendConfig,
74
79
  DataCrunchBackendConfig,
75
80
  GCPBackendConfig,
81
+ HotAisleBackendConfig,
76
82
  KubernetesBackendConfig,
77
83
  LambdaBackendConfig,
78
84
  NebiusBackendConfig,
@@ -95,6 +101,7 @@ AnyBackendConfigWithCreds = Union[
95
101
  CudoBackendConfigWithCreds,
96
102
  DataCrunchBackendConfigWithCreds,
97
103
  GCPBackendConfigWithCreds,
104
+ HotAisleBackendConfigWithCreds,
98
105
  KubernetesBackendConfigWithCreds,
99
106
  LambdaBackendConfigWithCreds,
100
107
  OCIBackendConfigWithCreds,
@@ -116,6 +123,7 @@ AnyBackendFileConfigWithCreds = Union[
116
123
  CudoBackendConfigWithCreds,
117
124
  DataCrunchBackendConfigWithCreds,
118
125
  GCPBackendFileConfigWithCreds,
126
+ HotAisleBackendFileConfigWithCreds,
119
127
  KubernetesBackendFileConfigWithCreds,
120
128
  LambdaBackendConfigWithCreds,
121
129
  OCIBackendConfigWithCreds,
@@ -57,6 +57,8 @@ def get_fleet_spec_excludes(fleet_spec: FleetSpec) -> Optional[IncludeExcludeDic
57
57
  profile_excludes.add("startup_order")
58
58
  if profile.stop_criteria is None:
59
59
  profile_excludes.add("stop_criteria")
60
+ if profile.schedule is None:
61
+ profile_excludes.add("schedule")
60
62
  if configuration_excludes:
61
63
  spec_excludes["configuration"] = configuration_excludes
62
64
  if profile_excludes:
@@ -53,6 +53,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
53
53
  job_submissions_excludes["exit_status"] = True
54
54
  if all(js.deployment_num == 0 for js in job_submissions):
55
55
  job_submissions_excludes["deployment_num"] = True
56
+ if all(not js.probes for js in job_submissions):
57
+ job_submissions_excludes["probes"] = True
56
58
  latest_job_submission = current_resource.latest_job_submission
57
59
  if latest_job_submission is not None:
58
60
  latest_job_submission_excludes: IncludeExcludeDictType = {}
@@ -69,6 +71,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
69
71
  latest_job_submission_excludes["exit_status"] = True
70
72
  if latest_job_submission.deployment_num == 0:
71
73
  latest_job_submission_excludes["deployment_num"] = True
74
+ if not latest_job_submission.probes:
75
+ latest_job_submission_excludes["probes"] = True
72
76
  return {"plan": apply_plan_excludes}
73
77
 
74
78
 
@@ -120,12 +124,18 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
120
124
  profile_excludes.add("startup_order")
121
125
  if configuration.stop_criteria is None:
122
126
  configuration_excludes["stop_criteria"] = True
127
+ if isinstance(configuration, ServiceConfiguration) and not configuration.probes:
128
+ configuration_excludes["probes"] = True
123
129
  if profile is not None and profile.stop_criteria is None:
124
130
  profile_excludes.add("stop_criteria")
125
131
  if not configuration.files:
126
132
  configuration_excludes["files"] = True
127
133
  if not run_spec.file_archives:
128
134
  spec_excludes["file_archives"] = True
135
+ if configuration.schedule is None:
136
+ configuration_excludes["schedule"] = True
137
+ if profile is not None and profile.schedule is None:
138
+ profile_excludes.add("schedule")
129
139
 
130
140
  if configuration_excludes:
131
141
  spec_excludes["configuration"] = configuration_excludes
@@ -150,6 +160,8 @@ def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType:
150
160
  spec_excludes["file_archives"] = True
151
161
  if all(s.service_port is None for s in job_specs):
152
162
  spec_excludes["service_port"] = True
163
+ if all(not s.probes for s in job_specs):
164
+ spec_excludes["probes"] = True
153
165
 
154
166
  return spec_excludes
155
167
 
@@ -11,6 +11,7 @@ class BackendType(str, enum.Enum):
11
11
  DSTACK (BackendType): dstack Sky
12
12
  GCP (BackendType): Google Cloud Platform
13
13
  DATACRUNCH (BackendType): DataCrunch
14
+ HOTAISLE (BackendType): Hot Aisle
14
15
  KUBERNETES (BackendType): Kubernetes
15
16
  LAMBDA (BackendType): Lambda Cloud
16
17
  NEBIUS (BackendType): Nebius AI Cloud
@@ -28,6 +29,7 @@ class BackendType(str, enum.Enum):
28
29
  DATACRUNCH = "datacrunch"
29
30
  DSTACK = "dstack"
30
31
  GCP = "gcp"
32
+ HOTAISLE = "hotaisle"
31
33
  KUBERNETES = "kubernetes"
32
34
  LAMBDA = "lambda"
33
35
  LOCAL = "local"
@@ -14,11 +14,12 @@ from dstack._internal.core.models.envs import Env
14
14
  from dstack._internal.core.models.files import FilePathMapping
15
15
  from dstack._internal.core.models.fleets import FleetConfiguration
16
16
  from dstack._internal.core.models.gateways import GatewayConfiguration
17
- from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
17
+ from dstack._internal.core.models.profiles import ProfileParams, parse_duration, parse_off_duration
18
18
  from dstack._internal.core.models.resources import Range, ResourcesSpec
19
19
  from dstack._internal.core.models.services import AnyModel, OpenAIChatModel
20
20
  from dstack._internal.core.models.unix import UnixUser
21
21
  from dstack._internal.core.models.volumes import MountPoint, VolumeConfiguration, parse_mount_point
22
+ from dstack._internal.utils.common import has_duplicates
22
23
  from dstack._internal.utils.json_utils import (
23
24
  pydantic_orjson_dumps_with_indent,
24
25
  )
@@ -32,6 +33,14 @@ RUN_PRIOTIRY_MIN = 0
32
33
  RUN_PRIOTIRY_MAX = 100
33
34
  RUN_PRIORITY_DEFAULT = 0
34
35
  DEFAULT_REPO_DIR = "/workflow"
36
+ MIN_PROBE_TIMEOUT = 1
37
+ MIN_PROBE_INTERVAL = 1
38
+ DEFAULT_PROBE_URL = "/"
39
+ DEFAULT_PROBE_TIMEOUT = 10
40
+ DEFAULT_PROBE_INTERVAL = 15
41
+ DEFAULT_PROBE_READY_AFTER = 1
42
+ DEFAULT_PROBE_METHOD = "get"
43
+ MAX_PROBE_URL_LEN = 2048
35
44
 
36
45
 
37
46
  class RunConfigurationType(str, Enum):
@@ -162,6 +171,121 @@ class RateLimit(CoreModel):
162
171
  ] = 0
163
172
 
164
173
 
174
+ HTTPMethod = Literal["get", "post", "put", "delete", "patch", "head"]
175
+
176
+
177
class HTTPHeaderSpec(CoreModel):
    """An HTTP header (name/value pair) sent with a probe request."""

    name: Annotated[
        str,
        Field(
            description="The name of the HTTP header",
            min_length=1,
            max_length=256,
        ),
    ]
    value: Annotated[
        str,
        Field(
            description="The value of the HTTP header",
            min_length=1,
            max_length=2048,
        ),
    ]
194
+
195
+
196
class ProbeConfig(CoreModel):
    """Configuration of a single HTTP health probe for a service replica."""

    type: Literal["http"]  # expect other probe types in the future, namely `exec`
    url: Annotated[
        Optional[str], Field(description=f"The URL to request. Defaults to `{DEFAULT_PROBE_URL}`")
    ] = None
    method: Annotated[
        Optional[HTTPMethod],
        Field(
            description=(
                "The HTTP method to use for the probe (e.g., `get`, `post`, etc.)."
                f" Defaults to `{DEFAULT_PROBE_METHOD}`"
            )
        ),
    ] = None
    headers: Annotated[
        list[HTTPHeaderSpec],
        Field(description="A list of HTTP headers to include in the request", max_items=16),
    ] = []
    body: Annotated[
        Optional[str],
        Field(
            description="The HTTP request body to send with the probe",
            min_length=1,
            max_length=2048,
        ),
    ] = None
    timeout: Annotated[
        Optional[Union[int, str]],
        Field(
            description=(
                f"Maximum amount of time the HTTP request is allowed to take. Defaults to `{DEFAULT_PROBE_TIMEOUT}s`"
            )
        ),
    ] = None
    interval: Annotated[
        Optional[Union[int, str]],
        Field(
            description=(
                "Minimum amount of time between the end of one probe execution"
                f" and the start of the next. Defaults to `{DEFAULT_PROBE_INTERVAL}s`"
            )
        ),
    ] = None
    ready_after: Annotated[
        Optional[int],
        Field(
            ge=1,
            description=(
                "The number of consecutive successful probe executions required for the replica"
                " to be considered ready. Used during rolling deployments."
                f" Defaults to `{DEFAULT_PROBE_READY_AFTER}`"
            ),
        ),
    ] = None

    @validator("timeout")
    def parse_timeout(cls, v: Optional[Union[int, str]]) -> Optional[int]:
        """Normalize a duration string/int to seconds and enforce the minimum."""
        if v is None:
            return v
        parsed = parse_duration(v)
        if parsed < MIN_PROBE_TIMEOUT:
            raise ValueError(f"Probe timeout cannot be shorter than {MIN_PROBE_TIMEOUT}s")
        return parsed

    @validator("interval")
    def parse_interval(cls, v: Optional[Union[int, str]]) -> Optional[int]:
        """Normalize a duration string/int to seconds and enforce the minimum."""
        if v is None:
            return v
        parsed = parse_duration(v)
        if parsed < MIN_PROBE_INTERVAL:
            raise ValueError(f"Probe interval cannot be shorter than {MIN_PROBE_INTERVAL}s")
        return parsed

    @validator("url")
    def validate_url(cls, v: Optional[str]) -> Optional[str]:
        """Require an absolute path, bounded length, and printable characters."""
        if v is None:
            return v
        if not v.startswith("/"):
            raise ValueError("Must start with `/`")
        if len(v) > MAX_PROBE_URL_LEN:
            raise ValueError(f"Cannot be longer than {MAX_PROBE_URL_LEN} characters")
        if not v.isprintable():
            raise ValueError("Cannot contain non-printable characters")
        return v

    @root_validator
    def validate_body_matches_method(cls, values):
        """Reject a request body for methods that must not carry one.

        Bug fix: use `values.get(...)` instead of `values[...]` — in a pydantic
        v1 root_validator, a field whose own validation failed is absent from
        `values`, so direct indexing raised KeyError and masked the real error.
        NOTE(review): when `method` is omitted it defaults to
        `DEFAULT_PROBE_METHOD` ("get") later; a body with an omitted method is
        still accepted here — confirm whether that combination should also be
        rejected.
        """
        method: Optional[HTTPMethod] = values.get("method")
        if values.get("body") is not None and method in ["get", "head"]:
            raise ValueError(f"Cannot set request body for the `{method}` method")
        return values
287
+
288
+
165
289
  class BaseRunConfiguration(CoreModel):
166
290
  type: Literal["none"]
167
291
  name: Annotated[
@@ -448,6 +572,10 @@ class ServiceConfigurationParams(CoreModel):
448
572
  Field(description="The auto-scaling rules. Required if `replicas` is set to a range"),
449
573
  ] = None
450
574
  rate_limits: Annotated[list[RateLimit], Field(description="Rate limiting rules")] = []
575
+ probes: Annotated[
576
+ list[ProbeConfig],
577
+ Field(description="List of probes used to determine job health"),
578
+ ] = []
451
579
 
452
580
  @validator("port")
453
581
  def convert_port(cls, v) -> PortMapping:
@@ -511,6 +639,16 @@ class ServiceConfigurationParams(CoreModel):
511
639
  )
512
640
  return v
513
641
 
642
+ @validator("probes")
643
+ def validate_probes(cls, v: list[ProbeConfig]) -> list[ProbeConfig]:
644
+ if has_duplicates(v):
645
+ # Using a custom validator instead of Field(unique_items=True) to avoid Pydantic bug:
646
+ # https://github.com/pydantic/pydantic/issues/3765
647
+ # Because of the bug, our gen_schema_reference.py fails to determine the type of
648
+ # ServiceConfiguration.probes and insert the correct hyperlink.
649
+ raise ValueError("Probes must be unique")
650
+ return v
651
+
514
652
 
515
653
  class ServiceConfiguration(
516
654
  ProfileParams, BaseRunConfigurationWithCommands, ServiceConfigurationParams
@@ -0,0 +1,28 @@
1
+ from datetime import datetime
2
+ from enum import Enum
3
+
4
+ from dstack._internal.core.models.common import CoreModel
5
+
6
+
7
class HealthStatus(str, Enum):
    """Health state of an instance: healthy, degraded (warning), or failed."""

    HEALTHY = "healthy"
    WARNING = "warning"
    FAILURE = "failure"

    def is_healthy(self) -> bool:
        """True only for the HEALTHY state."""
        return self is HealthStatus.HEALTHY

    def is_failure(self) -> bool:
        """True only for the FAILURE state."""
        return self is HealthStatus.FAILURE
17
+
18
+
19
class HealthEvent(CoreModel):
    # A single timestamped health observation with a human-readable message.
    timestamp: datetime
    status: HealthStatus
    message: str


class HealthCheck(CoreModel):
    # Aggregated health report: the status as of `collected_at` plus the
    # events it was derived from.
    collected_at: datetime
    status: HealthStatus
    events: list[HealthEvent]
@@ -9,6 +9,7 @@ from pydantic import root_validator
9
9
  from dstack._internal.core.models.backends.base import BackendType
10
10
  from dstack._internal.core.models.common import CoreModel
11
11
  from dstack._internal.core.models.envs import Env
12
+ from dstack._internal.core.models.health import HealthStatus
12
13
  from dstack._internal.core.models.volumes import Volume
13
14
  from dstack._internal.utils.common import pretty_resources
14
15
 
@@ -225,6 +226,7 @@ class Instance(CoreModel):
225
226
  hostname: Optional[str] = None
226
227
  status: InstanceStatus
227
228
  unreachable: bool = False
229
+ health_status: HealthStatus = HealthStatus.HEALTHY
228
230
  termination_reason: Optional[str] = None
229
231
  created: datetime.datetime
230
232
  region: Optional[str] = None
@@ -23,4 +23,5 @@ class LogEvent(CoreModel):
23
23
 
24
24
  class JobSubmissionLogs(CoreModel):
25
25
  logs: List[LogEvent]
26
- next_token: Optional[str]
26
+ external_url: Optional[str] = None
27
+ next_token: Optional[str] = None