dstack 0.19.20__py3-none-any.whl → 0.19.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dstack might be problematic. Click here for more details.
- dstack/_internal/cli/commands/apply.py +8 -3
- dstack/_internal/cli/services/configurators/__init__.py +8 -0
- dstack/_internal/cli/services/configurators/fleet.py +1 -1
- dstack/_internal/cli/services/configurators/gateway.py +1 -1
- dstack/_internal/cli/services/configurators/run.py +11 -1
- dstack/_internal/cli/services/configurators/volume.py +1 -1
- dstack/_internal/cli/utils/common.py +48 -5
- dstack/_internal/cli/utils/fleet.py +5 -5
- dstack/_internal/cli/utils/run.py +32 -0
- dstack/_internal/core/backends/__init__.py +0 -65
- dstack/_internal/core/backends/configurators.py +9 -0
- dstack/_internal/core/backends/features.py +64 -0
- dstack/_internal/core/backends/hotaisle/__init__.py +1 -0
- dstack/_internal/core/backends/hotaisle/api_client.py +109 -0
- dstack/_internal/core/backends/hotaisle/backend.py +16 -0
- dstack/_internal/core/backends/hotaisle/compute.py +225 -0
- dstack/_internal/core/backends/hotaisle/configurator.py +60 -0
- dstack/_internal/core/backends/hotaisle/models.py +45 -0
- dstack/_internal/core/backends/lambdalabs/compute.py +2 -1
- dstack/_internal/core/backends/models.py +8 -0
- dstack/_internal/core/compatibility/fleets.py +2 -0
- dstack/_internal/core/compatibility/runs.py +12 -0
- dstack/_internal/core/models/backends/base.py +2 -0
- dstack/_internal/core/models/configurations.py +139 -1
- dstack/_internal/core/models/health.py +28 -0
- dstack/_internal/core/models/instances.py +2 -0
- dstack/_internal/core/models/logs.py +2 -1
- dstack/_internal/core/models/profiles.py +37 -0
- dstack/_internal/core/models/runs.py +21 -1
- dstack/_internal/core/services/ssh/tunnel.py +7 -0
- dstack/_internal/server/app.py +26 -10
- dstack/_internal/server/background/__init__.py +9 -6
- dstack/_internal/server/background/tasks/process_fleets.py +52 -38
- dstack/_internal/server/background/tasks/process_gateways.py +2 -2
- dstack/_internal/server/background/tasks/process_idle_volumes.py +5 -4
- dstack/_internal/server/background/tasks/process_instances.py +168 -103
- dstack/_internal/server/background/tasks/process_metrics.py +9 -2
- dstack/_internal/server/background/tasks/process_placement_groups.py +2 -0
- dstack/_internal/server/background/tasks/process_probes.py +164 -0
- dstack/_internal/server/background/tasks/process_prometheus_metrics.py +14 -2
- dstack/_internal/server/background/tasks/process_running_jobs.py +142 -124
- dstack/_internal/server/background/tasks/process_runs.py +84 -34
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +12 -10
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +12 -4
- dstack/_internal/server/background/tasks/process_volumes.py +4 -1
- dstack/_internal/server/migrations/versions/25479f540245_add_probes.py +43 -0
- dstack/_internal/server/migrations/versions/50dd7ea98639_index_status_columns.py +55 -0
- dstack/_internal/server/migrations/versions/728b1488b1b4_add_instance_health.py +50 -0
- dstack/_internal/server/migrations/versions/ec02a26a256c_add_runmodel_next_triggered_at.py +38 -0
- dstack/_internal/server/models.py +57 -16
- dstack/_internal/server/routers/instances.py +33 -5
- dstack/_internal/server/schemas/health/dcgm.py +56 -0
- dstack/_internal/server/schemas/instances.py +32 -0
- dstack/_internal/server/schemas/runner.py +5 -0
- dstack/_internal/server/services/fleets.py +19 -10
- dstack/_internal/server/services/gateways/__init__.py +17 -17
- dstack/_internal/server/services/instances.py +113 -15
- dstack/_internal/server/services/jobs/__init__.py +18 -13
- dstack/_internal/server/services/jobs/configurators/base.py +26 -0
- dstack/_internal/server/services/logging.py +4 -2
- dstack/_internal/server/services/logs/aws.py +13 -1
- dstack/_internal/server/services/logs/gcp.py +16 -1
- dstack/_internal/server/services/offers.py +3 -3
- dstack/_internal/server/services/probes.py +6 -0
- dstack/_internal/server/services/projects.py +51 -19
- dstack/_internal/server/services/prometheus/client_metrics.py +3 -0
- dstack/_internal/server/services/prometheus/custom_metrics.py +2 -3
- dstack/_internal/server/services/runner/client.py +52 -20
- dstack/_internal/server/services/runner/ssh.py +4 -4
- dstack/_internal/server/services/runs.py +115 -39
- dstack/_internal/server/services/services/__init__.py +4 -1
- dstack/_internal/server/services/ssh.py +66 -0
- dstack/_internal/server/services/users.py +2 -3
- dstack/_internal/server/services/volumes.py +11 -11
- dstack/_internal/server/settings.py +16 -0
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-8f9ee218d3eb45989682.css → main-03e818b110e1d5705378.css} +1 -1
- dstack/_internal/server/statics/{main-39a767528976f8078166.js → main-cc067b7fd1a8f33f97da.js} +26 -15
- dstack/_internal/server/statics/{main-39a767528976f8078166.js.map → main-cc067b7fd1a8f33f97da.js.map} +1 -1
- dstack/_internal/server/testing/common.py +51 -0
- dstack/_internal/{core/backends/remote → server/utils}/provisioning.py +22 -17
- dstack/_internal/server/utils/sentry_utils.py +12 -0
- dstack/_internal/settings.py +3 -0
- dstack/_internal/utils/common.py +15 -0
- dstack/_internal/utils/cron.py +5 -0
- dstack/api/server/__init__.py +1 -1
- dstack/version.py +1 -1
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/METADATA +13 -22
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/RECORD +93 -75
- /dstack/_internal/{core/backends/remote → server/schemas/health}/__init__.py +0 -0
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/WHEEL +0 -0
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/entry_points.txt +0 -0
- {dstack-0.19.20.dist-info → dstack-0.19.22.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
import shlex
|
|
2
|
+
import subprocess
|
|
3
|
+
import tempfile
|
|
4
|
+
from threading import Thread
|
|
5
|
+
from typing import List, Optional
|
|
6
|
+
|
|
7
|
+
import gpuhunt
|
|
8
|
+
from gpuhunt.providers.hotaisle import HotAisleProvider
|
|
9
|
+
|
|
10
|
+
from dstack._internal.core.backends.base.compute import (
|
|
11
|
+
Compute,
|
|
12
|
+
ComputeWithCreateInstanceSupport,
|
|
13
|
+
get_shim_commands,
|
|
14
|
+
)
|
|
15
|
+
from dstack._internal.core.backends.base.offers import get_catalog_offers
|
|
16
|
+
from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient
|
|
17
|
+
from dstack._internal.core.backends.hotaisle.models import HotAisleConfig
|
|
18
|
+
from dstack._internal.core.models.backends.base import BackendType
|
|
19
|
+
from dstack._internal.core.models.common import CoreModel
|
|
20
|
+
from dstack._internal.core.models.instances import (
|
|
21
|
+
InstanceAvailability,
|
|
22
|
+
InstanceConfiguration,
|
|
23
|
+
InstanceOfferWithAvailability,
|
|
24
|
+
)
|
|
25
|
+
from dstack._internal.core.models.placement import PlacementGroup
|
|
26
|
+
from dstack._internal.core.models.runs import JobProvisioningData, Requirements
|
|
27
|
+
from dstack._internal.utils.logging import get_logger
|
|
28
|
+
|
|
29
|
+
logger = get_logger(__name__)
|
|
30
|
+
|
|
31
|
+
# Hot Aisle caps VM names at 60 characters.
MAX_INSTANCE_NAME_LEN = 60


# CPU details per supported Hot Aisle instance type. The Hot Aisle VM-creation
# API requires explicit CPU model/frequency/manufacturer, which the gpuhunt
# catalog does not provide, so they are hard-coded here. Offers whose instance
# type is not listed here are skipped (see HotAisleCompute.get_offers).
INSTANCE_TYPE_SPECS = {
    "1x MI300X 8x Xeon Platinum 8462Y+": {
        "cpu_model": "Xeon Platinum 8462Y+",
        "cpu_frequency": 2800000000,  # Hz
        "cpu_manufacturer": "Intel",
    },
    "1x MI300X 13x Xeon Platinum 8470": {
        "cpu_model": "Xeon Platinum 8470",
        "cpu_frequency": 2000000000,  # Hz
        "cpu_manufacturer": "Intel",
    },
}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class HotAisleCompute(
    ComputeWithCreateInstanceSupport,
    Compute,
):
    """`Compute` implementation for the Hot Aisle backend.

    Provisioning flow: a VM is created via the Hot Aisle API; once the VM
    reports the `running` state, dstack-shim is launched on it over SSH from
    a background thread.
    """

    def __init__(self, config: HotAisleConfig):
        super().__init__()
        self.config = config
        self.api_client = HotAisleAPIClient(config.creds.api_key, config.team_handle)
        # Dedicated catalog backed by the live Hot Aisle provider; resource
        # balancing and automatic reloading are not needed here.
        self.catalog = gpuhunt.Catalog(balance_resources=False, auto_reload=False)
        self.catalog.add_provider(
            HotAisleProvider(api_key=config.creds.api_key, team_handle=config.team_handle)
        )

    def get_offers(
        self, requirements: Optional[Requirements] = None
    ) -> List[InstanceOfferWithAvailability]:
        """Return catalog offers restricted to known-supported instance types."""
        catalog_offers = get_catalog_offers(
            backend=BackendType.HOTAISLE,
            locations=self.config.regions or None,
            requirements=requirements,
            catalog=self.catalog,
        )
        result: List[InstanceOfferWithAvailability] = []
        for catalog_offer in catalog_offers:
            if catalog_offer.instance.name not in INSTANCE_TYPE_SPECS:
                # CPU details needed for the VM-creation payload are unknown
                # for this type — see INSTANCE_TYPE_SPECS.
                logger.warning(
                    f"Skipping unsupported Hot Aisle instance type: {catalog_offer.instance.name}"
                )
                continue
            result.append(
                InstanceOfferWithAvailability(
                    **catalog_offer.dict(), availability=InstanceAvailability.AVAILABLE
                )
            )
        return result

    def get_payload_from_offer(self, instance_type) -> dict:
        """Build the Hot Aisle VM-creation payload from an instance type."""
        specs = INSTANCE_TYPE_SPECS[instance_type.name]
        cores = instance_type.resources.cpus
        gpus = instance_type.resources.gpus
        return {
            "cpu_cores": cores,
            "cpus": {
                "count": 1,
                "manufacturer": specs["cpu_manufacturer"],
                "model": specs["cpu_model"],
                "cores": cores,
                "frequency": specs["cpu_frequency"],
            },
            # dstack stores sizes in MiB; the Hot Aisle API expects bytes.
            "disk_capacity": instance_type.resources.disk.size_mib * 1024**2,
            "ram_capacity": instance_type.resources.memory_mib * 1024**2,
            "gpus": [
                {
                    "count": len(gpus),
                    "manufacturer": gpus[0].vendor,
                    "model": gpus[0].name,
                }
            ],
        }

    def create_instance(
        self,
        instance_offer: InstanceOfferWithAvailability,
        instance_config: InstanceConfiguration,
        placement_group: Optional[PlacementGroup],
    ) -> JobProvisioningData:
        """Create a Hot Aisle VM and return its provisioning data.

        The hostname is left unset here; it is filled in by
        `update_provisioning_data` once the VM is running.
        """
        project_ssh_key = instance_config.ssh_keys[0]
        self.api_client.upload_ssh_key(project_ssh_key.public)
        payload = self.get_payload_from_offer(instance_offer.instance)
        vm_data = self.api_client.create_virtual_machine(payload)
        return JobProvisioningData(
            backend=instance_offer.backend,
            instance_type=instance_offer.instance,
            instance_id=vm_data["name"],
            hostname=None,
            internal_ip=None,
            region=instance_offer.region,
            price=instance_offer.price,
            username="hotaisle",
            ssh_port=22,
            dockerized=True,
            ssh_proxy=None,
            backend_data=HotAisleInstanceBackendData(
                ip_address=vm_data["ip_address"], vm_id=vm_data["name"]
            ).json(),
        )

    def update_provisioning_data(
        self,
        provisioning_data: JobProvisioningData,
        project_ssh_public_key: str,
        project_ssh_private_key: str,
    ):
        """Once the VM is running, set its hostname and start dstack-shim on it."""
        vm_state = self.api_client.get_vm_state(provisioning_data.instance_id)
        if vm_state != "running":
            return
        if provisioning_data.hostname is None and provisioning_data.backend_data:
            backend_data = HotAisleInstanceBackendData.load(provisioning_data.backend_data)
            provisioning_data.hostname = backend_data.ip_address
        shim_commands = get_shim_commands(
            authorized_keys=[project_ssh_public_key],
            arch=provisioning_data.instance_type.resources.cpu_arch,
        )
        launch_command = "sudo sh -c " + shlex.quote(" && ".join(shim_commands))
        # Start the shim over SSH from a daemon thread so this (periodically
        # polled) method returns immediately.
        Thread(
            target=_start_runner,
            kwargs={
                "hostname": provisioning_data.hostname,
                "project_ssh_private_key": project_ssh_private_key,
                "launch_command": launch_command,
            },
            daemon=True,
        ).start()

    def terminate_instance(
        self, instance_id: str, region: str, backend_data: Optional[str] = None
    ):
        """Terminate the VM; `instance_id` is the Hot Aisle VM name."""
        self.api_client.terminate_virtual_machine(instance_id)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _start_runner(
    hostname: str,
    project_ssh_private_key: str,
    launch_command: str,
):
    """Thread target: launch dstack-shim on the provisioned VM over SSH."""
    _launch_runner(
        hostname=hostname,
        ssh_private_key=project_ssh_private_key,
        launch_command=launch_command,
    )
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _launch_runner(
    hostname: str,
    ssh_private_key: str,
    launch_command: str,
):
    """Run the shim launch command on the host as a detached background process."""
    # Strip any trailing '&' so backgrounding/redirection is controlled here,
    # then detach from the SSH session with `disown` and log to a known path.
    detached_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown"
    _run_ssh_command(
        hostname=hostname,
        ssh_private_key=ssh_private_key,
        command=detached_command,
    )
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _run_ssh_command(hostname: str, ssh_private_key: str, command: str):
    """Run `command` on the host as user `hotaisle` over SSH, discarding output.

    The private key is written to a temporary file for the duration of the
    call; `tempfile` creates the file with 0600 permissions, as the ssh
    client requires.
    """
    # Bug fix: the second positional parameter of NamedTemporaryFile is
    # `buffering`, not a permission mask — the original `("w+", 0o600)` set a
    # 384-byte buffer instead of file permissions. Temp files are already
    # created with mode 0600 on POSIX, so no mode argument is needed.
    with tempfile.NamedTemporaryFile("w+") as f:
        f.write(ssh_private_key)
        f.flush()
        subprocess.run(
            [
                "ssh",
                "-F",
                "none",
                "-o",
                # The host key is unknown for a freshly provisioned VM.
                "StrictHostKeyChecking=no",
                "-i",
                f.name,
                f"hotaisle@{hostname}",
                command,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
class HotAisleInstanceBackendData(CoreModel):
    """Backend-specific instance data serialized into `JobProvisioningData.backend_data`."""

    ip_address: str
    vm_id: Optional[str] = None

    @classmethod
    def load(cls, raw: Optional[str]) -> "HotAisleInstanceBackendData":
        """Deserialize from the JSON produced by `.json()`; `None` indicates a caller bug."""
        assert raw is not None
        return cls.__response__.parse_raw(raw)
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from dstack._internal.core.backends.base.configurator import (
|
|
4
|
+
BackendRecord,
|
|
5
|
+
Configurator,
|
|
6
|
+
)
|
|
7
|
+
from dstack._internal.core.backends.hotaisle.api_client import HotAisleAPIClient
|
|
8
|
+
from dstack._internal.core.backends.hotaisle.backend import HotAisleBackend
|
|
9
|
+
from dstack._internal.core.backends.hotaisle.models import (
|
|
10
|
+
AnyHotAisleBackendConfig,
|
|
11
|
+
AnyHotAisleCreds,
|
|
12
|
+
HotAisleBackendConfig,
|
|
13
|
+
HotAisleBackendConfigWithCreds,
|
|
14
|
+
HotAisleConfig,
|
|
15
|
+
HotAisleCreds,
|
|
16
|
+
HotAisleStoredConfig,
|
|
17
|
+
)
|
|
18
|
+
from dstack._internal.core.models.backends.base import (
|
|
19
|
+
BackendType,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class HotAisleConfigurator(Configurator):
    """Configurator for the Hot Aisle backend.

    Validates credentials and converts between the API, stored, and runtime
    config representations. Config and credentials are persisted separately
    in the backend record.
    """

    TYPE = BackendType.HOTAISLE
    BACKEND_CLASS = HotAisleBackend

    def validate_config(self, config: HotAisleBackendConfigWithCreds, default_creds_enabled: bool):
        # Hot Aisle has no default credentials; always validate the provided key.
        self._validate_creds(config.creds, config.team_handle)

    def create_backend(
        self, project_name: str, config: HotAisleBackendConfigWithCreds
    ) -> BackendRecord:
        stored_config = HotAisleStoredConfig(
            **HotAisleBackendConfig.__response__.parse_obj(config).dict()
        )
        return BackendRecord(
            config=stored_config.json(),
            auth=HotAisleCreds.parse_obj(config.creds).json(),
        )

    def get_backend_config(
        self, record: BackendRecord, include_creds: bool
    ) -> AnyHotAisleBackendConfig:
        config = self._get_config(record)
        if include_creds:
            return HotAisleBackendConfigWithCreds.__response__.parse_obj(config)
        return HotAisleBackendConfig.__response__.parse_obj(config)

    def get_backend(self, record: BackendRecord) -> HotAisleBackend:
        return HotAisleBackend(config=self._get_config(record))

    def _get_config(self, record: BackendRecord) -> HotAisleConfig:
        # Recombine the separately stored config and credentials.
        return HotAisleConfig.__response__(
            **json.loads(record.config),
            creds=HotAisleCreds.parse_raw(record.auth),
        )

    def _validate_creds(self, creds: AnyHotAisleCreds, team_handle: str):
        # Raises if the API key is rejected by the Hot Aisle API.
        api_client = HotAisleAPIClient(creds.api_key, team_handle)
        api_client.validate_api_key()
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from typing import Annotated, List, Literal, Optional, Union
|
|
2
|
+
|
|
3
|
+
from pydantic import Field
|
|
4
|
+
|
|
5
|
+
from dstack._internal.core.models.common import CoreModel
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class HotAisleAPIKeyCreds(CoreModel):
    """API-key credentials for Hot Aisle."""

    type: Annotated[Literal["api_key"], Field(description="The type of credentials")] = "api_key"
    api_key: Annotated[str, Field(description="The Hot Aisle API key")]


# Only one credentials flavor exists today, so the aliases are plain synonyms.
AnyHotAisleCreds = HotAisleAPIKeyCreds
HotAisleCreds = AnyHotAisleCreds


class HotAisleBackendConfig(CoreModel):
    """Hot Aisle backend settings, without credentials."""

    type: Annotated[
        Literal["hotaisle"],
        Field(description="The type of backend"),
    ] = "hotaisle"
    team_handle: Annotated[str, Field(description="The Hot Aisle team handle")]
    regions: Annotated[
        Optional[List[str]],
        Field(description="The list of Hot Aisle regions. Omit to use all regions"),
    ] = None


class HotAisleBackendConfigWithCreds(HotAisleBackendConfig):
    """API-facing config that includes credentials."""

    creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")]


AnyHotAisleBackendConfig = Union[HotAisleBackendConfig, HotAisleBackendConfigWithCreds]


class HotAisleBackendFileConfigWithCreds(HotAisleBackendConfig):
    """Config shape accepted in the server config file."""

    creds: Annotated[AnyHotAisleCreds, Field(description="The credentials")]


class HotAisleStoredConfig(HotAisleBackendConfig):
    """Config shape persisted in the database (credentials are stored separately)."""

    pass


class HotAisleConfig(HotAisleStoredConfig):
    """Full runtime config: stored config plus credentials."""

    creds: AnyHotAisleCreds
|
|
@@ -206,10 +206,11 @@ def _launch_runner(
|
|
|
206
206
|
ssh_private_key: str,
|
|
207
207
|
launch_command: str,
|
|
208
208
|
):
|
|
209
|
+
daemonized_command = f"{launch_command.rstrip('&')} >/tmp/dstack-shim.log 2>&1 & disown"
|
|
209
210
|
_run_ssh_command(
|
|
210
211
|
hostname=hostname,
|
|
211
212
|
ssh_private_key=ssh_private_key,
|
|
212
|
-
command=
|
|
213
|
+
command=daemonized_command,
|
|
213
214
|
)
|
|
214
215
|
|
|
215
216
|
|
|
@@ -29,6 +29,11 @@ from dstack._internal.core.backends.gcp.models import (
|
|
|
29
29
|
GCPBackendConfigWithCreds,
|
|
30
30
|
GCPBackendFileConfigWithCreds,
|
|
31
31
|
)
|
|
32
|
+
from dstack._internal.core.backends.hotaisle.models import (
|
|
33
|
+
HotAisleBackendConfig,
|
|
34
|
+
HotAisleBackendConfigWithCreds,
|
|
35
|
+
HotAisleBackendFileConfigWithCreds,
|
|
36
|
+
)
|
|
32
37
|
from dstack._internal.core.backends.kubernetes.models import (
|
|
33
38
|
KubernetesBackendConfig,
|
|
34
39
|
KubernetesBackendConfigWithCreds,
|
|
@@ -73,6 +78,7 @@ AnyBackendConfigWithoutCreds = Union[
|
|
|
73
78
|
CudoBackendConfig,
|
|
74
79
|
DataCrunchBackendConfig,
|
|
75
80
|
GCPBackendConfig,
|
|
81
|
+
HotAisleBackendConfig,
|
|
76
82
|
KubernetesBackendConfig,
|
|
77
83
|
LambdaBackendConfig,
|
|
78
84
|
NebiusBackendConfig,
|
|
@@ -95,6 +101,7 @@ AnyBackendConfigWithCreds = Union[
|
|
|
95
101
|
CudoBackendConfigWithCreds,
|
|
96
102
|
DataCrunchBackendConfigWithCreds,
|
|
97
103
|
GCPBackendConfigWithCreds,
|
|
104
|
+
HotAisleBackendConfigWithCreds,
|
|
98
105
|
KubernetesBackendConfigWithCreds,
|
|
99
106
|
LambdaBackendConfigWithCreds,
|
|
100
107
|
OCIBackendConfigWithCreds,
|
|
@@ -116,6 +123,7 @@ AnyBackendFileConfigWithCreds = Union[
|
|
|
116
123
|
CudoBackendConfigWithCreds,
|
|
117
124
|
DataCrunchBackendConfigWithCreds,
|
|
118
125
|
GCPBackendFileConfigWithCreds,
|
|
126
|
+
HotAisleBackendFileConfigWithCreds,
|
|
119
127
|
KubernetesBackendFileConfigWithCreds,
|
|
120
128
|
LambdaBackendConfigWithCreds,
|
|
121
129
|
OCIBackendConfigWithCreds,
|
|
@@ -57,6 +57,8 @@ def get_fleet_spec_excludes(fleet_spec: FleetSpec) -> Optional[IncludeExcludeDic
|
|
|
57
57
|
profile_excludes.add("startup_order")
|
|
58
58
|
if profile.stop_criteria is None:
|
|
59
59
|
profile_excludes.add("stop_criteria")
|
|
60
|
+
if profile.schedule is None:
|
|
61
|
+
profile_excludes.add("schedule")
|
|
60
62
|
if configuration_excludes:
|
|
61
63
|
spec_excludes["configuration"] = configuration_excludes
|
|
62
64
|
if profile_excludes:
|
|
@@ -53,6 +53,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
|
|
|
53
53
|
job_submissions_excludes["exit_status"] = True
|
|
54
54
|
if all(js.deployment_num == 0 for js in job_submissions):
|
|
55
55
|
job_submissions_excludes["deployment_num"] = True
|
|
56
|
+
if all(not js.probes for js in job_submissions):
|
|
57
|
+
job_submissions_excludes["probes"] = True
|
|
56
58
|
latest_job_submission = current_resource.latest_job_submission
|
|
57
59
|
if latest_job_submission is not None:
|
|
58
60
|
latest_job_submission_excludes: IncludeExcludeDictType = {}
|
|
@@ -69,6 +71,8 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
|
|
|
69
71
|
latest_job_submission_excludes["exit_status"] = True
|
|
70
72
|
if latest_job_submission.deployment_num == 0:
|
|
71
73
|
latest_job_submission_excludes["deployment_num"] = True
|
|
74
|
+
if not latest_job_submission.probes:
|
|
75
|
+
latest_job_submission_excludes["probes"] = True
|
|
72
76
|
return {"plan": apply_plan_excludes}
|
|
73
77
|
|
|
74
78
|
|
|
@@ -120,12 +124,18 @@ def get_run_spec_excludes(run_spec: RunSpec) -> IncludeExcludeDictType:
|
|
|
120
124
|
profile_excludes.add("startup_order")
|
|
121
125
|
if configuration.stop_criteria is None:
|
|
122
126
|
configuration_excludes["stop_criteria"] = True
|
|
127
|
+
if isinstance(configuration, ServiceConfiguration) and not configuration.probes:
|
|
128
|
+
configuration_excludes["probes"] = True
|
|
123
129
|
if profile is not None and profile.stop_criteria is None:
|
|
124
130
|
profile_excludes.add("stop_criteria")
|
|
125
131
|
if not configuration.files:
|
|
126
132
|
configuration_excludes["files"] = True
|
|
127
133
|
if not run_spec.file_archives:
|
|
128
134
|
spec_excludes["file_archives"] = True
|
|
135
|
+
if configuration.schedule is None:
|
|
136
|
+
configuration_excludes["schedule"] = True
|
|
137
|
+
if profile is not None and profile.schedule is None:
|
|
138
|
+
profile_excludes.add("schedule")
|
|
129
139
|
|
|
130
140
|
if configuration_excludes:
|
|
131
141
|
spec_excludes["configuration"] = configuration_excludes
|
|
@@ -150,6 +160,8 @@ def get_job_spec_excludes(job_specs: list[JobSpec]) -> IncludeExcludeDictType:
|
|
|
150
160
|
spec_excludes["file_archives"] = True
|
|
151
161
|
if all(s.service_port is None for s in job_specs):
|
|
152
162
|
spec_excludes["service_port"] = True
|
|
163
|
+
if all(not s.probes for s in job_specs):
|
|
164
|
+
spec_excludes["probes"] = True
|
|
153
165
|
|
|
154
166
|
return spec_excludes
|
|
155
167
|
|
|
@@ -11,6 +11,7 @@ class BackendType(str, enum.Enum):
|
|
|
11
11
|
DSTACK (BackendType): dstack Sky
|
|
12
12
|
GCP (BackendType): Google Cloud Platform
|
|
13
13
|
DATACRUNCH (BackendType): DataCrunch
|
|
14
|
+
HOTAISLE (BackendType): Hot Aisle
|
|
14
15
|
KUBERNETES (BackendType): Kubernetes
|
|
15
16
|
LAMBDA (BackendType): Lambda Cloud
|
|
16
17
|
NEBIUS (BackendType): Nebius AI Cloud
|
|
@@ -28,6 +29,7 @@ class BackendType(str, enum.Enum):
|
|
|
28
29
|
DATACRUNCH = "datacrunch"
|
|
29
30
|
DSTACK = "dstack"
|
|
30
31
|
GCP = "gcp"
|
|
32
|
+
HOTAISLE = "hotaisle"
|
|
31
33
|
KUBERNETES = "kubernetes"
|
|
32
34
|
LAMBDA = "lambda"
|
|
33
35
|
LOCAL = "local"
|
|
@@ -14,11 +14,12 @@ from dstack._internal.core.models.envs import Env
|
|
|
14
14
|
from dstack._internal.core.models.files import FilePathMapping
|
|
15
15
|
from dstack._internal.core.models.fleets import FleetConfiguration
|
|
16
16
|
from dstack._internal.core.models.gateways import GatewayConfiguration
|
|
17
|
-
from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
|
|
17
|
+
from dstack._internal.core.models.profiles import ProfileParams, parse_duration, parse_off_duration
|
|
18
18
|
from dstack._internal.core.models.resources import Range, ResourcesSpec
|
|
19
19
|
from dstack._internal.core.models.services import AnyModel, OpenAIChatModel
|
|
20
20
|
from dstack._internal.core.models.unix import UnixUser
|
|
21
21
|
from dstack._internal.core.models.volumes import MountPoint, VolumeConfiguration, parse_mount_point
|
|
22
|
+
from dstack._internal.utils.common import has_duplicates
|
|
22
23
|
from dstack._internal.utils.json_utils import (
|
|
23
24
|
pydantic_orjson_dumps_with_indent,
|
|
24
25
|
)
|
|
@@ -32,6 +33,14 @@ RUN_PRIOTIRY_MIN = 0
|
|
|
32
33
|
RUN_PRIOTIRY_MAX = 100
|
|
33
34
|
RUN_PRIORITY_DEFAULT = 0
|
|
34
35
|
DEFAULT_REPO_DIR = "/workflow"
|
|
36
|
+
MIN_PROBE_TIMEOUT = 1
|
|
37
|
+
MIN_PROBE_INTERVAL = 1
|
|
38
|
+
DEFAULT_PROBE_URL = "/"
|
|
39
|
+
DEFAULT_PROBE_TIMEOUT = 10
|
|
40
|
+
DEFAULT_PROBE_INTERVAL = 15
|
|
41
|
+
DEFAULT_PROBE_READY_AFTER = 1
|
|
42
|
+
DEFAULT_PROBE_METHOD = "get"
|
|
43
|
+
MAX_PROBE_URL_LEN = 2048
|
|
35
44
|
|
|
36
45
|
|
|
37
46
|
class RunConfigurationType(str, Enum):
|
|
@@ -162,6 +171,121 @@ class RateLimit(CoreModel):
|
|
|
162
171
|
] = 0
|
|
163
172
|
|
|
164
173
|
|
|
174
|
+
# HTTP methods allowed for probe requests.
HTTPMethod = Literal["get", "post", "put", "delete", "patch", "head"]


class HTTPHeaderSpec(CoreModel):
    """A single HTTP header to send with a probe request."""

    name: Annotated[
        str,
        Field(
            description="The name of the HTTP header",
            min_length=1,
            max_length=256,
        ),
    ]
    value: Annotated[
        str,
        Field(
            description="The value of the HTTP header",
            min_length=1,
            max_length=2048,
        ),
    ]
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class ProbeConfig(CoreModel):
    """Configuration of a single HTTP probe used to determine job health.

    `timeout` and `interval` accept either seconds as an int or a duration
    string and are normalized to seconds by the validators below.
    """

    type: Literal["http"]  # expect other probe types in the future, namely `exec`
    url: Annotated[
        Optional[str], Field(description=f"The URL to request. Defaults to `{DEFAULT_PROBE_URL}`")
    ] = None
    method: Annotated[
        Optional[HTTPMethod],
        Field(
            description=(
                "The HTTP method to use for the probe (e.g., `get`, `post`, etc.)."
                f" Defaults to `{DEFAULT_PROBE_METHOD}`"
            )
        ),
    ] = None
    headers: Annotated[
        list[HTTPHeaderSpec],
        Field(description="A list of HTTP headers to include in the request", max_items=16),
    ] = []
    body: Annotated[
        Optional[str],
        Field(
            description="The HTTP request body to send with the probe",
            min_length=1,
            max_length=2048,
        ),
    ] = None
    timeout: Annotated[
        Optional[Union[int, str]],
        Field(
            description=(
                f"Maximum amount of time the HTTP request is allowed to take. Defaults to `{DEFAULT_PROBE_TIMEOUT}s`"
            )
        ),
    ] = None
    interval: Annotated[
        Optional[Union[int, str]],
        Field(
            description=(
                "Minimum amount of time between the end of one probe execution"
                f" and the start of the next. Defaults to `{DEFAULT_PROBE_INTERVAL}s`"
            )
        ),
    ] = None
    ready_after: Annotated[
        Optional[int],
        Field(
            ge=1,
            description=(
                "The number of consecutive successful probe executions required for the replica"
                " to be considered ready. Used during rolling deployments."
                f" Defaults to `{DEFAULT_PROBE_READY_AFTER}`"
            ),
        ),
    ] = None

    @validator("timeout")
    def parse_timeout(cls, v: Optional[Union[int, str]]) -> Optional[int]:
        """Normalize a duration string/int to seconds and enforce the minimum."""
        if v is None:
            return v
        parsed = parse_duration(v)
        if parsed < MIN_PROBE_TIMEOUT:
            raise ValueError(f"Probe timeout cannot be shorter than {MIN_PROBE_TIMEOUT}s")
        return parsed

    @validator("interval")
    def parse_interval(cls, v: Optional[Union[int, str]]) -> Optional[int]:
        """Normalize a duration string/int to seconds and enforce the minimum."""
        if v is None:
            return v
        parsed = parse_duration(v)
        if parsed < MIN_PROBE_INTERVAL:
            raise ValueError(f"Probe interval cannot be shorter than {MIN_PROBE_INTERVAL}s")
        return parsed

    @validator("url")
    def validate_url(cls, v: Optional[str]) -> Optional[str]:
        """Require a printable, absolute path of bounded length."""
        if v is None:
            return v
        if not v.startswith("/"):
            raise ValueError("Must start with `/`")
        if len(v) > MAX_PROBE_URL_LEN:
            raise ValueError(f"Cannot be longer than {MAX_PROBE_URL_LEN} characters")
        if not v.isprintable():
            raise ValueError("Cannot contain non-printable characters")
        return v

    @root_validator
    def validate_body_matches_method(cls, values):
        """Reject a request body for HTTP methods that must not carry one."""
        # Robustness fix: use .get() — in pydantic v1, fields that failed their
        # own validators are absent from `values`, so direct indexing could
        # raise KeyError instead of producing a clean validation error.
        method: Optional[HTTPMethod] = values.get("method")
        if values.get("body") is not None and method in ["get", "head"]:
            raise ValueError(f"Cannot set request body for the `{method}` method")
        return values
|
|
287
|
+
|
|
288
|
+
|
|
165
289
|
class BaseRunConfiguration(CoreModel):
|
|
166
290
|
type: Literal["none"]
|
|
167
291
|
name: Annotated[
|
|
@@ -448,6 +572,10 @@ class ServiceConfigurationParams(CoreModel):
|
|
|
448
572
|
Field(description="The auto-scaling rules. Required if `replicas` is set to a range"),
|
|
449
573
|
] = None
|
|
450
574
|
rate_limits: Annotated[list[RateLimit], Field(description="Rate limiting rules")] = []
|
|
575
|
+
probes: Annotated[
|
|
576
|
+
list[ProbeConfig],
|
|
577
|
+
Field(description="List of probes used to determine job health"),
|
|
578
|
+
] = []
|
|
451
579
|
|
|
452
580
|
@validator("port")
|
|
453
581
|
def convert_port(cls, v) -> PortMapping:
|
|
@@ -511,6 +639,16 @@ class ServiceConfigurationParams(CoreModel):
|
|
|
511
639
|
)
|
|
512
640
|
return v
|
|
513
641
|
|
|
642
|
+
@validator("probes")
|
|
643
|
+
def validate_probes(cls, v: list[ProbeConfig]) -> list[ProbeConfig]:
|
|
644
|
+
if has_duplicates(v):
|
|
645
|
+
# Using a custom validator instead of Field(unique_items=True) to avoid Pydantic bug:
|
|
646
|
+
# https://github.com/pydantic/pydantic/issues/3765
|
|
647
|
+
# Because of the bug, our gen_schema_reference.py fails to determine the type of
|
|
648
|
+
# ServiceConfiguration.probes and insert the correct hyperlink.
|
|
649
|
+
raise ValueError("Probes must be unique")
|
|
650
|
+
return v
|
|
651
|
+
|
|
514
652
|
|
|
515
653
|
class ServiceConfiguration(
|
|
516
654
|
ProfileParams, BaseRunConfigurationWithCommands, ServiceConfigurationParams
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from enum import Enum
|
|
3
|
+
|
|
4
|
+
from dstack._internal.core.models.common import CoreModel
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class HealthStatus(str, Enum):
    """Overall health verdict for an instance health check."""

    HEALTHY = "healthy"
    WARNING = "warning"
    FAILURE = "failure"

    def is_healthy(self) -> bool:
        """True only for the HEALTHY status."""
        return self is HealthStatus.HEALTHY

    def is_failure(self) -> bool:
        """True only for the FAILURE status."""
        return self is HealthStatus.FAILURE
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class HealthEvent(CoreModel):
    """A single timestamped health observation with a human-readable message."""

    timestamp: datetime
    status: HealthStatus
    message: str
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class HealthCheck(CoreModel):
    """An aggregated health check result: overall status plus contributing events."""

    collected_at: datetime
    status: HealthStatus
    events: list[HealthEvent]
|
|
@@ -9,6 +9,7 @@ from pydantic import root_validator
|
|
|
9
9
|
from dstack._internal.core.models.backends.base import BackendType
|
|
10
10
|
from dstack._internal.core.models.common import CoreModel
|
|
11
11
|
from dstack._internal.core.models.envs import Env
|
|
12
|
+
from dstack._internal.core.models.health import HealthStatus
|
|
12
13
|
from dstack._internal.core.models.volumes import Volume
|
|
13
14
|
from dstack._internal.utils.common import pretty_resources
|
|
14
15
|
|
|
@@ -225,6 +226,7 @@ class Instance(CoreModel):
|
|
|
225
226
|
hostname: Optional[str] = None
|
|
226
227
|
status: InstanceStatus
|
|
227
228
|
unreachable: bool = False
|
|
229
|
+
health_status: HealthStatus = HealthStatus.HEALTHY
|
|
228
230
|
termination_reason: Optional[str] = None
|
|
229
231
|
created: datetime.datetime
|
|
230
232
|
region: Optional[str] = None
|