dstack 0.18.40__py3-none-any.whl → 0.18.41__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/apply.py +8 -5
- dstack/_internal/cli/services/configurators/base.py +4 -2
- dstack/_internal/cli/services/configurators/fleet.py +21 -9
- dstack/_internal/cli/services/configurators/gateway.py +15 -0
- dstack/_internal/cli/services/configurators/run.py +6 -5
- dstack/_internal/cli/services/configurators/volume.py +15 -0
- dstack/_internal/cli/services/repos.py +3 -3
- dstack/_internal/cli/utils/fleet.py +44 -33
- dstack/_internal/cli/utils/run.py +27 -7
- dstack/_internal/cli/utils/volume.py +21 -9
- dstack/_internal/core/backends/aws/compute.py +92 -52
- dstack/_internal/core/backends/aws/resources.py +22 -12
- dstack/_internal/core/backends/azure/compute.py +2 -0
- dstack/_internal/core/backends/base/compute.py +20 -2
- dstack/_internal/core/backends/gcp/compute.py +30 -23
- dstack/_internal/core/backends/gcp/resources.py +0 -15
- dstack/_internal/core/backends/oci/compute.py +10 -5
- dstack/_internal/core/backends/oci/resources.py +23 -26
- dstack/_internal/core/backends/remote/provisioning.py +65 -27
- dstack/_internal/core/backends/runpod/compute.py +1 -0
- dstack/_internal/core/models/backends/azure.py +3 -1
- dstack/_internal/core/models/configurations.py +24 -1
- dstack/_internal/core/models/fleets.py +46 -0
- dstack/_internal/core/models/instances.py +5 -1
- dstack/_internal/core/models/pools.py +4 -1
- dstack/_internal/core/models/profiles.py +10 -4
- dstack/_internal/core/models/runs.py +20 -0
- dstack/_internal/core/models/volumes.py +3 -0
- dstack/_internal/core/services/ssh/attach.py +92 -53
- dstack/_internal/core/services/ssh/tunnel.py +58 -31
- dstack/_internal/proxy/gateway/routers/registry.py +2 -0
- dstack/_internal/proxy/gateway/schemas/registry.py +2 -0
- dstack/_internal/proxy/gateway/services/registry.py +4 -0
- dstack/_internal/proxy/lib/models.py +3 -0
- dstack/_internal/proxy/lib/services/service_connection.py +8 -1
- dstack/_internal/server/background/tasks/process_instances.py +72 -33
- dstack/_internal/server/background/tasks/process_metrics.py +9 -9
- dstack/_internal/server/background/tasks/process_running_jobs.py +73 -26
- dstack/_internal/server/background/tasks/process_runs.py +2 -12
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +109 -42
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -1
- dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
- dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
- dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
- dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
- dstack/_internal/server/models.py +10 -4
- dstack/_internal/server/routers/runs.py +1 -0
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/services/backends/configurators/azure.py +34 -8
- dstack/_internal/server/services/config.py +9 -0
- dstack/_internal/server/services/fleets.py +27 -2
- dstack/_internal/server/services/gateways/client.py +9 -1
- dstack/_internal/server/services/jobs/__init__.py +215 -43
- dstack/_internal/server/services/jobs/configurators/base.py +47 -2
- dstack/_internal/server/services/offers.py +91 -5
- dstack/_internal/server/services/pools.py +95 -11
- dstack/_internal/server/services/proxy/repo.py +17 -3
- dstack/_internal/server/services/runner/client.py +1 -1
- dstack/_internal/server/services/runner/ssh.py +33 -5
- dstack/_internal/server/services/runs.py +48 -179
- dstack/_internal/server/services/services/__init__.py +9 -1
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js → main-2ac66bfcbd2e39830b88.js} +30 -31
- dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js.map → main-2ac66bfcbd2e39830b88.js.map} +1 -1
- dstack/_internal/server/statics/{main-fc56d1f4af8e57522a1c.css → main-ad5150a441de98cd8987.css} +1 -1
- dstack/_internal/server/testing/common.py +117 -52
- dstack/_internal/utils/common.py +22 -8
- dstack/_internal/utils/env.py +14 -0
- dstack/_internal/utils/ssh.py +1 -1
- dstack/api/server/_fleets.py +25 -1
- dstack/api/server/_runs.py +23 -2
- dstack/api/server/_volumes.py +12 -1
- dstack/version.py +1 -1
- {dstack-0.18.40.dist-info → dstack-0.18.41.dist-info}/METADATA +1 -1
- {dstack-0.18.40.dist-info → dstack-0.18.41.dist-info}/RECORD +98 -89
- tests/_internal/cli/services/configurators/test_profile.py +3 -3
- tests/_internal/core/services/ssh/test_tunnel.py +56 -4
- tests/_internal/proxy/gateway/routers/test_registry.py +30 -7
- tests/_internal/server/background/tasks/test_process_instances.py +138 -20
- tests/_internal/server/background/tasks/test_process_metrics.py +12 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +192 -0
- tests/_internal/server/background/tasks/test_process_runs.py +27 -3
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +48 -3
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +126 -13
- tests/_internal/server/routers/test_fleets.py +15 -2
- tests/_internal/server/routers/test_pools.py +6 -0
- tests/_internal/server/routers/test_runs.py +27 -0
- tests/_internal/server/services/jobs/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/test_base.py +72 -0
- tests/_internal/server/services/test_pools.py +4 -0
- tests/_internal/server/services/test_runs.py +5 -41
- tests/_internal/utils/test_common.py +21 -0
- tests/_internal/utils/test_env.py +38 -0
- {dstack-0.18.40.dist-info → dstack-0.18.41.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.40.dist-info → dstack-0.18.41.dist-info}/WHEEL +0 -0
- {dstack-0.18.40.dist-info → dstack-0.18.41.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.40.dist-info → dstack-0.18.41.dist-info}/top_level.txt +0 -0
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import io
|
|
2
2
|
import json
|
|
3
3
|
import time
|
|
4
|
-
from contextlib import contextmanager
|
|
4
|
+
from contextlib import contextmanager, nullcontext
|
|
5
5
|
from textwrap import dedent
|
|
6
|
-
from typing import Any, Dict, Generator, List
|
|
6
|
+
from typing import Any, Dict, Generator, List, Optional
|
|
7
7
|
|
|
8
8
|
import paramiko
|
|
9
9
|
from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib
|
|
@@ -17,6 +17,7 @@ from dstack._internal.core.models.instances import (
|
|
|
17
17
|
Gpu,
|
|
18
18
|
InstanceType,
|
|
19
19
|
Resources,
|
|
20
|
+
SSHConnectionParams,
|
|
20
21
|
)
|
|
21
22
|
from dstack._internal.utils.gpu import (
|
|
22
23
|
convert_amd_gpu_name,
|
|
@@ -262,35 +263,72 @@ def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
|
|
|
262
263
|
|
|
263
264
|
@contextmanager
|
|
264
265
|
def get_paramiko_connection(
|
|
265
|
-
ssh_user: str,
|
|
266
|
+
ssh_user: str,
|
|
267
|
+
host: str,
|
|
268
|
+
port: int,
|
|
269
|
+
pkeys: List[paramiko.PKey],
|
|
270
|
+
proxy: Optional[SSHConnectionParams] = None,
|
|
271
|
+
proxy_pkeys: Optional[list[paramiko.PKey]] = None,
|
|
266
272
|
) -> Generator[paramiko.SSHClient, None, None]:
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
273
|
+
if proxy is not None:
|
|
274
|
+
if proxy_pkeys is None:
|
|
275
|
+
raise ProvisioningError("Missing proxy private keys")
|
|
276
|
+
proxy_ctx = get_paramiko_connection(
|
|
277
|
+
proxy.username, proxy.hostname, proxy.port, proxy_pkeys
|
|
278
|
+
)
|
|
279
|
+
else:
|
|
280
|
+
proxy_ctx = nullcontext()
|
|
281
|
+
conn_url = f"{ssh_user}@{host}:{port}"
|
|
282
|
+
with proxy_ctx as proxy_client, paramiko.SSHClient() as client:
|
|
283
|
+
proxy_channel: Optional[paramiko.Channel] = None
|
|
284
|
+
if proxy_client is not None:
|
|
271
285
|
try:
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
username=ssh_user,
|
|
275
|
-
hostname=host,
|
|
276
|
-
port=port,
|
|
277
|
-
pkey=pkey,
|
|
278
|
-
look_for_keys=False,
|
|
279
|
-
allow_agent=False,
|
|
280
|
-
timeout=SSH_CONNECT_TIMEOUT,
|
|
286
|
+
proxy_channel = proxy_client.get_transport().open_channel(
|
|
287
|
+
"direct-tcpip", (host, port), ("", 0)
|
|
281
288
|
)
|
|
282
|
-
except paramiko.AuthenticationException:
|
|
283
|
-
logger.debug(
|
|
284
|
-
f'Authentication failed to connect to "{conn_url}" and {pkey.fingerprint}'
|
|
285
|
-
)
|
|
286
|
-
continue # try next key
|
|
287
289
|
except (paramiko.SSHException, OSError) as e:
|
|
288
|
-
raise ProvisioningError(f"
|
|
289
|
-
|
|
290
|
+
raise ProvisioningError(f"Proxy channel failed: {e}") from e
|
|
291
|
+
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
|
292
|
+
for pkey in pkeys:
|
|
293
|
+
logger.debug("Try to connect to %s with key %s", conn_url, pkey.fingerprint)
|
|
294
|
+
connected = _paramiko_connect(client, ssh_user, host, port, pkey, proxy_channel)
|
|
295
|
+
if connected:
|
|
290
296
|
yield client
|
|
291
297
|
return
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
raise ProvisioningError(
|
|
295
|
-
f"SSH connection to the {conn_url} with keys [{keys_fp}] was unsuccessful"
|
|
298
|
+
logger.debug(
|
|
299
|
+
f'Authentication failed to connect to "{conn_url}" and {pkey.fingerprint}'
|
|
296
300
|
)
|
|
301
|
+
keys_fp = ", ".join(f"{pk.fingerprint!r}" for pk in pkeys)
|
|
302
|
+
raise ProvisioningError(
|
|
303
|
+
f"SSH connection to the {conn_url} with keys [{keys_fp}] was unsuccessful"
|
|
304
|
+
)
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _paramiko_connect(
|
|
308
|
+
client: paramiko.SSHClient,
|
|
309
|
+
user: str,
|
|
310
|
+
host: str,
|
|
311
|
+
port: int,
|
|
312
|
+
pkey: paramiko.PKey,
|
|
313
|
+
channel: Optional[paramiko.Channel] = None,
|
|
314
|
+
) -> bool:
|
|
315
|
+
"""
|
|
316
|
+
Returns `True` if connected, `False` if auth failed, and raises `ProvisioningError`
|
|
317
|
+
on other errors.
|
|
318
|
+
"""
|
|
319
|
+
try:
|
|
320
|
+
client.connect(
|
|
321
|
+
username=user,
|
|
322
|
+
hostname=host,
|
|
323
|
+
port=port,
|
|
324
|
+
pkey=pkey,
|
|
325
|
+
look_for_keys=False,
|
|
326
|
+
allow_agent=False,
|
|
327
|
+
timeout=SSH_CONNECT_TIMEOUT,
|
|
328
|
+
sock=channel,
|
|
329
|
+
)
|
|
330
|
+
return True
|
|
331
|
+
except paramiko.AuthenticationException:
|
|
332
|
+
return False
|
|
333
|
+
except (paramiko.SSHException, OSError) as e:
|
|
334
|
+
raise ProvisioningError(f"Connect failed: {e}") from e
|
|
@@ -11,6 +11,7 @@ class AzureConfigInfo(CoreModel):
|
|
|
11
11
|
type: Literal["azure"] = "azure"
|
|
12
12
|
tenant_id: str
|
|
13
13
|
subscription_id: str
|
|
14
|
+
resource_group: Optional[str] = None
|
|
14
15
|
locations: Optional[List[str]] = None
|
|
15
16
|
vpc_ids: Optional[Dict[str, str]] = None
|
|
16
17
|
public_ips: Optional[bool] = None
|
|
@@ -48,6 +49,7 @@ class AzureConfigInfoWithCredsPartial(CoreModel):
|
|
|
48
49
|
creds: Optional[AnyAzureCreds]
|
|
49
50
|
tenant_id: Optional[str]
|
|
50
51
|
subscription_id: Optional[str]
|
|
52
|
+
resource_group: Optional[str]
|
|
51
53
|
locations: Optional[List[str]]
|
|
52
54
|
vpc_ids: Optional[Dict[str, str]]
|
|
53
55
|
public_ips: Optional[bool]
|
|
@@ -63,4 +65,4 @@ class AzureConfigValues(CoreModel):
|
|
|
63
65
|
|
|
64
66
|
|
|
65
67
|
class AzureStoredConfig(AzureConfigInfo):
|
|
66
|
-
resource_group: str
|
|
68
|
+
resource_group: str = ""
|
|
@@ -10,7 +10,7 @@ from dstack._internal.core.models.common import CoreModel, Duration, RegistryAut
|
|
|
10
10
|
from dstack._internal.core.models.envs import Env
|
|
11
11
|
from dstack._internal.core.models.fleets import FleetConfiguration
|
|
12
12
|
from dstack._internal.core.models.gateways import GatewayConfiguration
|
|
13
|
-
from dstack._internal.core.models.profiles import ProfileParams
|
|
13
|
+
from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
|
|
14
14
|
from dstack._internal.core.models.repos.base import Repo
|
|
15
15
|
from dstack._internal.core.models.repos.virtual import VirtualRepo
|
|
16
16
|
from dstack._internal.core.models.resources import Range, ResourcesSpec
|
|
@@ -212,6 +212,29 @@ class DevEnvironmentConfigurationParams(CoreModel):
|
|
|
212
212
|
ide: Annotated[Literal["vscode"], Field(description="The IDE to run")]
|
|
213
213
|
version: Annotated[Optional[str], Field(description="The version of the IDE")]
|
|
214
214
|
init: Annotated[CommandsList, Field(description="The bash commands to run on startup")] = []
|
|
215
|
+
inactivity_duration: Annotated[
|
|
216
|
+
Optional[Union[Literal["off"], int, bool, str]],
|
|
217
|
+
Field(
|
|
218
|
+
description=(
|
|
219
|
+
"The maximum amount of time the dev environment can be inactive"
|
|
220
|
+
" (e.g., `2h`, `1d`, etc)."
|
|
221
|
+
" After it elapses, the dev environment is automatically stopped."
|
|
222
|
+
" Inactivity is defined as the absence of SSH connections to the"
|
|
223
|
+
" dev environment, including VS Code connections, `ssh <run name>`"
|
|
224
|
+
" shells, and attached `dstack apply` or `dstack attach` commands."
|
|
225
|
+
" Use `off` for unlimited duration. Defaults to `off`"
|
|
226
|
+
)
|
|
227
|
+
),
|
|
228
|
+
]
|
|
229
|
+
|
|
230
|
+
@validator("inactivity_duration", pre=True, allow_reuse=True)
|
|
231
|
+
def parse_inactivity_duration(
|
|
232
|
+
cls, v: Optional[Union[Literal["off"], int, bool, str]]
|
|
233
|
+
) -> Optional[int]:
|
|
234
|
+
v = parse_off_duration(v)
|
|
235
|
+
if isinstance(v, int):
|
|
236
|
+
return v
|
|
237
|
+
return None
|
|
215
238
|
|
|
216
239
|
|
|
217
240
|
class DevEnvironmentConfiguration(
|
|
@@ -39,6 +39,14 @@ class InstanceGroupPlacement(str, Enum):
|
|
|
39
39
|
CLUSTER = "cluster"
|
|
40
40
|
|
|
41
41
|
|
|
42
|
+
class SSHProxyParams(CoreModel):
|
|
43
|
+
hostname: Annotated[str, Field(description="The IP address or domain of proxy host")]
|
|
44
|
+
port: Annotated[Optional[int], Field(description="The SSH port of proxy host")] = None
|
|
45
|
+
user: Annotated[str, Field(description="The user to log in with for proxy host")]
|
|
46
|
+
identity_file: Annotated[str, Field(description="The private key to use for proxy host")]
|
|
47
|
+
ssh_key: Optional[SSHKey] = None
|
|
48
|
+
|
|
49
|
+
|
|
42
50
|
class SSHHostParams(CoreModel):
|
|
43
51
|
hostname: Annotated[str, Field(description="The IP address or domain to connect to")]
|
|
44
52
|
port: Annotated[
|
|
@@ -50,6 +58,9 @@ class SSHHostParams(CoreModel):
|
|
|
50
58
|
identity_file: Annotated[
|
|
51
59
|
Optional[str], Field(description="The private key to use for this host")
|
|
52
60
|
] = None
|
|
61
|
+
proxy_jump: Annotated[
|
|
62
|
+
Optional[SSHProxyParams], Field(description="The SSH proxy configuration for this host")
|
|
63
|
+
] = None
|
|
53
64
|
internal_ip: Annotated[
|
|
54
65
|
Optional[str],
|
|
55
66
|
Field(
|
|
@@ -61,6 +72,19 @@ class SSHHostParams(CoreModel):
|
|
|
61
72
|
] = None
|
|
62
73
|
ssh_key: Optional[SSHKey] = None
|
|
63
74
|
|
|
75
|
+
blocks: Annotated[
|
|
76
|
+
Union[Literal["auto"], int],
|
|
77
|
+
Field(
|
|
78
|
+
description=(
|
|
79
|
+
"The amount of blocks to split the instance into, a number or `auto`."
|
|
80
|
+
" `auto` means as many as possible."
|
|
81
|
+
" The number of GPUs and CPUs must be divisible by the number of blocks."
|
|
82
|
+
" Defaults to `1`, i.e. do not split"
|
|
83
|
+
),
|
|
84
|
+
ge=1,
|
|
85
|
+
),
|
|
86
|
+
] = 1
|
|
87
|
+
|
|
64
88
|
@validator("internal_ip")
|
|
65
89
|
def validate_internal_ip(cls, value):
|
|
66
90
|
if value is None:
|
|
@@ -83,6 +107,9 @@ class SSHParams(CoreModel):
|
|
|
83
107
|
Optional[str], Field(description="The private key to use for all hosts")
|
|
84
108
|
] = None
|
|
85
109
|
ssh_key: Optional[SSHKey] = None
|
|
110
|
+
proxy_jump: Annotated[
|
|
111
|
+
Optional[SSHProxyParams], Field(description="The SSH proxy configuration for all hosts")
|
|
112
|
+
] = None
|
|
86
113
|
hosts: Annotated[
|
|
87
114
|
List[Union[SSHHostParams, str]],
|
|
88
115
|
Field(
|
|
@@ -142,6 +169,19 @@ class InstanceGroupParams(CoreModel):
|
|
|
142
169
|
Field(description="The resources requirements"),
|
|
143
170
|
] = ResourcesSpec()
|
|
144
171
|
|
|
172
|
+
blocks: Annotated[
|
|
173
|
+
Union[Literal["auto"], int],
|
|
174
|
+
Field(
|
|
175
|
+
description=(
|
|
176
|
+
"The amount of blocks to split the instance into, a number or `auto`."
|
|
177
|
+
" `auto` means as many as possible."
|
|
178
|
+
" The number of GPUs and CPUs must be divisible by the number of blocks."
|
|
179
|
+
" Defaults to `1`, i.e. do not split"
|
|
180
|
+
),
|
|
181
|
+
ge=1,
|
|
182
|
+
),
|
|
183
|
+
] = 1
|
|
184
|
+
|
|
145
185
|
backends: Annotated[
|
|
146
186
|
Optional[List[BackendType]],
|
|
147
187
|
Field(description="The backends to consider for provisioning (e.g., `[aws, gcp]`)"),
|
|
@@ -152,6 +192,12 @@ class InstanceGroupParams(CoreModel):
|
|
|
152
192
|
description="The regions to consider for provisioning (e.g., `[eu-west-1, us-west4, westeurope]`)"
|
|
153
193
|
),
|
|
154
194
|
] = None
|
|
195
|
+
availability_zones: Annotated[
|
|
196
|
+
Optional[List[str]],
|
|
197
|
+
Field(
|
|
198
|
+
description="The availability zones to consider for provisioning (e.g., `[eu-west-1a, us-west4-a]`)"
|
|
199
|
+
),
|
|
200
|
+
] = None
|
|
155
201
|
instance_types: Annotated[
|
|
156
202
|
Optional[List[str]],
|
|
157
203
|
Field(
|
|
@@ -92,6 +92,8 @@ class RemoteConnectionInfo(CoreModel):
|
|
|
92
92
|
port: int
|
|
93
93
|
ssh_user: str
|
|
94
94
|
ssh_keys: List[SSHKey]
|
|
95
|
+
ssh_proxy: Optional[SSHConnectionParams] = None
|
|
96
|
+
ssh_proxy_keys: Optional[list[SSHKey]] = None
|
|
95
97
|
env: Env = Env()
|
|
96
98
|
|
|
97
99
|
|
|
@@ -101,7 +103,6 @@ class InstanceConfiguration(CoreModel):
|
|
|
101
103
|
user: str # dstack user name
|
|
102
104
|
ssh_keys: List[SSHKey]
|
|
103
105
|
instance_id: Optional[str] = None
|
|
104
|
-
availability_zone: Optional[str] = None
|
|
105
106
|
placement_group_name: Optional[str] = None
|
|
106
107
|
reservation: Optional[str] = None
|
|
107
108
|
volumes: Optional[List[Volume]] = None
|
|
@@ -140,7 +141,10 @@ class InstanceOffer(CoreModel):
|
|
|
140
141
|
|
|
141
142
|
class InstanceOfferWithAvailability(InstanceOffer):
|
|
142
143
|
availability: InstanceAvailability
|
|
144
|
+
availability_zones: Optional[List[str]] = None
|
|
143
145
|
instance_runtime: InstanceRuntime = InstanceRuntime.SHIM
|
|
146
|
+
blocks: int = 1
|
|
147
|
+
total_blocks: int = 1
|
|
144
148
|
|
|
145
149
|
|
|
146
150
|
class InstanceStatus(str, Enum):
|
|
@@ -25,14 +25,17 @@ class Instance(CoreModel):
|
|
|
25
25
|
fleet_name: Optional[str] = None
|
|
26
26
|
instance_num: int
|
|
27
27
|
pool_name: Optional[str] = None
|
|
28
|
-
job_name: Optional[str] = None
|
|
28
|
+
job_name: Optional[str] = None # deprecated, always None (instance can have more than one job)
|
|
29
29
|
hostname: Optional[str] = None
|
|
30
30
|
status: InstanceStatus
|
|
31
31
|
unreachable: bool = False
|
|
32
32
|
termination_reason: Optional[str] = None
|
|
33
33
|
created: datetime.datetime
|
|
34
34
|
region: Optional[str] = None
|
|
35
|
+
availability_zone: Optional[str] = None
|
|
35
36
|
price: Optional[float] = None
|
|
37
|
+
total_blocks: Optional[int] = None
|
|
38
|
+
busy_blocks: int = 0
|
|
36
39
|
|
|
37
40
|
|
|
38
41
|
class PoolInstances(CoreModel):
|
|
@@ -40,15 +40,15 @@ def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]:
|
|
|
40
40
|
return Duration.parse(v)
|
|
41
41
|
|
|
42
42
|
|
|
43
|
-
def parse_max_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int
|
|
43
|
+
def parse_max_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
|
|
44
44
|
return parse_off_duration(v)
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
def parse_stop_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int
|
|
47
|
+
def parse_stop_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
|
|
48
48
|
return parse_off_duration(v)
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int
|
|
51
|
+
def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
|
|
52
52
|
if v == "off" or v is False:
|
|
53
53
|
return "off"
|
|
54
54
|
if v is True:
|
|
@@ -123,6 +123,12 @@ class ProfileParams(CoreModel):
|
|
|
123
123
|
description="The regions to consider for provisioning (e.g., `[eu-west-1, us-west4, westeurope]`)"
|
|
124
124
|
),
|
|
125
125
|
]
|
|
126
|
+
availability_zones: Annotated[
|
|
127
|
+
Optional[List[str]],
|
|
128
|
+
Field(
|
|
129
|
+
description="The availability zones to consider for provisioning (e.g., `[eu-west-1a, us-west4-a]`)"
|
|
130
|
+
),
|
|
131
|
+
] = None
|
|
126
132
|
instance_types: Annotated[
|
|
127
133
|
Optional[List[str]],
|
|
128
134
|
Field(
|
|
@@ -162,7 +168,7 @@ class ProfileParams(CoreModel):
|
|
|
162
168
|
Optional[Union[Literal["off"], str, int, bool]],
|
|
163
169
|
Field(
|
|
164
170
|
description=(
|
|
165
|
-
"The maximum duration of a run
|
|
171
|
+
"The maximum duration of a run graceful stopping."
|
|
166
172
|
" After it elapses, the run is automatically forced stopped."
|
|
167
173
|
" This includes force detaching volumes used by the run."
|
|
168
174
|
" Use `off` for unlimited duration. Defaults to `5m`"
|
|
@@ -27,6 +27,7 @@ from dstack._internal.core.models.profiles import (
|
|
|
27
27
|
from dstack._internal.core.models.repos import AnyRunRepoData
|
|
28
28
|
from dstack._internal.core.models.resources import Memory, ResourcesSpec
|
|
29
29
|
from dstack._internal.core.models.unix import UnixUser
|
|
30
|
+
from dstack._internal.core.models.volumes import MountPoint
|
|
30
31
|
from dstack._internal.utils import common as common_utils
|
|
31
32
|
from dstack._internal.utils.common import format_pretty_duration
|
|
32
33
|
|
|
@@ -112,6 +113,7 @@ class JobTerminationReason(str, Enum):
|
|
|
112
113
|
DONE_BY_RUNNER = "done_by_runner"
|
|
113
114
|
ABORTED_BY_USER = "aborted_by_user"
|
|
114
115
|
TERMINATED_BY_SERVER = "terminated_by_server"
|
|
116
|
+
INACTIVITY_DURATION_EXCEEDED = "inactivity_duration_exceeded"
|
|
115
117
|
# Set by the runner
|
|
116
118
|
CONTAINER_EXITED_WITH_ERROR = "container_exited_with_error"
|
|
117
119
|
PORTS_BINDING_FAILED = "ports_binding_failed"
|
|
@@ -132,6 +134,7 @@ class JobTerminationReason(str, Enum):
|
|
|
132
134
|
self.DONE_BY_RUNNER: JobStatus.DONE,
|
|
133
135
|
self.ABORTED_BY_USER: JobStatus.ABORTED,
|
|
134
136
|
self.TERMINATED_BY_SERVER: JobStatus.TERMINATED,
|
|
137
|
+
self.INACTIVITY_DURATION_EXCEEDED: JobStatus.TERMINATED,
|
|
135
138
|
self.CONTAINER_EXITED_WITH_ERROR: JobStatus.FAILED,
|
|
136
139
|
self.PORTS_BINDING_FAILED: JobStatus.FAILED,
|
|
137
140
|
self.CREATING_CONTAINER_ERROR: JobStatus.FAILED,
|
|
@@ -190,6 +193,7 @@ class JobSpec(CoreModel):
|
|
|
190
193
|
registry_auth: Optional[RegistryAuth]
|
|
191
194
|
requirements: Requirements
|
|
192
195
|
retry: Optional[Retry]
|
|
196
|
+
volumes: Optional[List[MountPoint]] = None
|
|
193
197
|
# For backward compatibility with 0.18.x when retry_policy was required.
|
|
194
198
|
# TODO: remove in 0.19
|
|
195
199
|
retry_policy: ProfileRetryPolicy = ProfileRetryPolicy(retry=False)
|
|
@@ -231,6 +235,17 @@ class JobProvisioningData(CoreModel):
|
|
|
231
235
|
|
|
232
236
|
|
|
233
237
|
class JobRuntimeData(CoreModel):
|
|
238
|
+
"""
|
|
239
|
+
Holds various information only available after the job is submitted, such as:
|
|
240
|
+
* offer (depends on the instance)
|
|
241
|
+
* volumes used by the job
|
|
242
|
+
* resource constraints for container (depend on the instance)
|
|
243
|
+
* port mapping (reported by the shim only after the container is started)
|
|
244
|
+
|
|
245
|
+
Some fields are mutable, for example, `ports` only available when the shim starts
|
|
246
|
+
the container.
|
|
247
|
+
"""
|
|
248
|
+
|
|
234
249
|
network_mode: NetworkMode
|
|
235
250
|
# GPU, CPU, memory resource shares. None means all available (no limit)
|
|
236
251
|
gpu: Optional[int] = None
|
|
@@ -240,6 +255,10 @@ class JobRuntimeData(CoreModel):
|
|
|
240
255
|
# None if data is not yet available (on vm-based backends and ssh instances)
|
|
241
256
|
# or not applicable (container-based backends)
|
|
242
257
|
ports: Optional[dict[int, int]] = None
|
|
258
|
+
# List of volumes used by the job
|
|
259
|
+
volume_names: Optional[list[str]] = None # None for backward compatibility
|
|
260
|
+
# Virtual shared offer
|
|
261
|
+
offer: Optional[InstanceOfferWithAvailability] = None # None for backward compatibility
|
|
243
262
|
|
|
244
263
|
|
|
245
264
|
class ClusterInfo(CoreModel):
|
|
@@ -254,6 +273,7 @@ class JobSubmission(CoreModel):
|
|
|
254
273
|
submitted_at: datetime
|
|
255
274
|
last_processed_at: datetime
|
|
256
275
|
finished_at: Optional[datetime]
|
|
276
|
+
inactivity_secs: Optional[int]
|
|
257
277
|
status: JobStatus
|
|
258
278
|
termination_reason: Optional[JobTerminationReason]
|
|
259
279
|
termination_reason_message: Optional[str]
|
|
@@ -32,6 +32,9 @@ class VolumeConfiguration(CoreModel):
|
|
|
32
32
|
name: Annotated[Optional[str], Field(description="The volume name")] = None
|
|
33
33
|
backend: Annotated[BackendType, Field(description="The volume backend")]
|
|
34
34
|
region: Annotated[str, Field(description="The volume region")]
|
|
35
|
+
availability_zone: Annotated[
|
|
36
|
+
Optional[str], Field(description="The volume availability zone")
|
|
37
|
+
] = None
|
|
35
38
|
size: Annotated[
|
|
36
39
|
Optional[Memory],
|
|
37
40
|
Field(description="The volume size. Must be specified when creating new volumes"),
|
|
@@ -2,7 +2,7 @@ import atexit
|
|
|
2
2
|
import re
|
|
3
3
|
import time
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Optional
|
|
5
|
+
from typing import Optional, Union
|
|
6
6
|
|
|
7
7
|
import psutil
|
|
8
8
|
|
|
@@ -14,6 +14,8 @@ from dstack._internal.core.services.ssh.ports import PortsLock
|
|
|
14
14
|
from dstack._internal.core.services.ssh.tunnel import SSHTunnel, ports_to_forwarded_sockets
|
|
15
15
|
from dstack._internal.utils.path import FilePath, PathLike
|
|
16
16
|
from dstack._internal.utils.ssh import (
|
|
17
|
+
default_ssh_config_path,
|
|
18
|
+
get_host_config,
|
|
17
19
|
include_ssh_config,
|
|
18
20
|
normalize_path,
|
|
19
21
|
update_ssh_config,
|
|
@@ -88,28 +90,63 @@ class SSHAttach:
|
|
|
88
90
|
},
|
|
89
91
|
)
|
|
90
92
|
self.ssh_proxy = ssh_proxy
|
|
91
|
-
|
|
92
|
-
|
|
93
|
+
|
|
94
|
+
hosts: dict[str, dict[str, Union[str, int, FilePath]]] = {}
|
|
95
|
+
self.hosts = hosts
|
|
96
|
+
|
|
97
|
+
if local_backend:
|
|
98
|
+
hosts[run_name] = {
|
|
93
99
|
"HostName": hostname,
|
|
94
|
-
"Port":
|
|
95
|
-
"User":
|
|
96
|
-
"IdentityFile": self.identity_file,
|
|
97
|
-
"IdentitiesOnly": "yes",
|
|
98
|
-
"StrictHostKeyChecking": "no",
|
|
99
|
-
"UserKnownHostsFile": "/dev/null",
|
|
100
|
-
}
|
|
101
|
-
else:
|
|
102
|
-
self.host_config = {
|
|
103
|
-
"HostName": ssh_proxy.hostname,
|
|
104
|
-
"Port": ssh_proxy.port,
|
|
105
|
-
"User": ssh_proxy.username,
|
|
100
|
+
"Port": container_ssh_port,
|
|
101
|
+
"User": container_user,
|
|
106
102
|
"IdentityFile": self.identity_file,
|
|
107
103
|
"IdentitiesOnly": "yes",
|
|
108
104
|
"StrictHostKeyChecking": "no",
|
|
109
105
|
"UserKnownHostsFile": "/dev/null",
|
|
110
106
|
}
|
|
111
|
-
|
|
112
|
-
|
|
107
|
+
elif dockerized:
|
|
108
|
+
if ssh_proxy is not None:
|
|
109
|
+
# SSH instance with jump host
|
|
110
|
+
# dstack has no IdentityFile for jump host, it must be either preconfigured
|
|
111
|
+
# in the ~/.ssh/config or loaded into ssh-agent
|
|
112
|
+
hosts[f"{run_name}-jump-host"] = {
|
|
113
|
+
"HostName": ssh_proxy.hostname,
|
|
114
|
+
"Port": ssh_proxy.port,
|
|
115
|
+
"User": ssh_proxy.username,
|
|
116
|
+
"StrictHostKeyChecking": "no",
|
|
117
|
+
"UserKnownHostsFile": "/dev/null",
|
|
118
|
+
}
|
|
119
|
+
jump_host_config = get_host_config(ssh_proxy.hostname, default_ssh_config_path)
|
|
120
|
+
jump_host_identity_files = jump_host_config.get("identityfile")
|
|
121
|
+
if jump_host_identity_files:
|
|
122
|
+
hosts[f"{run_name}-jump-host"].update(
|
|
123
|
+
{
|
|
124
|
+
"IdentityFile": jump_host_identity_files[0],
|
|
125
|
+
"IdentitiesOnly": "yes",
|
|
126
|
+
}
|
|
127
|
+
)
|
|
128
|
+
hosts[f"{run_name}-host"] = {
|
|
129
|
+
"HostName": hostname,
|
|
130
|
+
"Port": ssh_port,
|
|
131
|
+
"User": user,
|
|
132
|
+
"IdentityFile": self.identity_file,
|
|
133
|
+
"IdentitiesOnly": "yes",
|
|
134
|
+
"StrictHostKeyChecking": "no",
|
|
135
|
+
"UserKnownHostsFile": "/dev/null",
|
|
136
|
+
"ProxyJump": f"{run_name}-jump-host",
|
|
137
|
+
}
|
|
138
|
+
else:
|
|
139
|
+
# Regular SSH instance or VM-based cloud instance
|
|
140
|
+
hosts[f"{run_name}-host"] = {
|
|
141
|
+
"HostName": hostname,
|
|
142
|
+
"Port": ssh_port,
|
|
143
|
+
"User": user,
|
|
144
|
+
"IdentityFile": self.identity_file,
|
|
145
|
+
"IdentitiesOnly": "yes",
|
|
146
|
+
"StrictHostKeyChecking": "no",
|
|
147
|
+
"UserKnownHostsFile": "/dev/null",
|
|
148
|
+
}
|
|
149
|
+
hosts[run_name] = {
|
|
113
150
|
"HostName": "localhost",
|
|
114
151
|
"Port": container_ssh_port,
|
|
115
152
|
"User": container_user,
|
|
@@ -119,32 +156,41 @@ class SSHAttach:
|
|
|
119
156
|
"UserKnownHostsFile": "/dev/null",
|
|
120
157
|
"ProxyJump": f"{run_name}-host",
|
|
121
158
|
}
|
|
122
|
-
elif ssh_proxy is not None:
|
|
123
|
-
self.container_config = {
|
|
124
|
-
"HostName": hostname,
|
|
125
|
-
"Port": ssh_port,
|
|
126
|
-
"User": container_user,
|
|
127
|
-
"IdentityFile": self.identity_file,
|
|
128
|
-
"IdentitiesOnly": "yes",
|
|
129
|
-
"StrictHostKeyChecking": "no",
|
|
130
|
-
"UserKnownHostsFile": "/dev/null",
|
|
131
|
-
"ProxyJump": f"{run_name}-jump-host",
|
|
132
|
-
}
|
|
133
159
|
else:
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
160
|
+
if ssh_proxy is not None:
|
|
161
|
+
# Kubernetes
|
|
162
|
+
hosts[f"{run_name}-jump-host"] = {
|
|
163
|
+
"HostName": ssh_proxy.hostname,
|
|
164
|
+
"Port": ssh_proxy.port,
|
|
165
|
+
"User": ssh_proxy.username,
|
|
166
|
+
"IdentityFile": self.identity_file,
|
|
167
|
+
"IdentitiesOnly": "yes",
|
|
168
|
+
"StrictHostKeyChecking": "no",
|
|
169
|
+
"UserKnownHostsFile": "/dev/null",
|
|
170
|
+
}
|
|
171
|
+
hosts[run_name] = {
|
|
172
|
+
"HostName": hostname,
|
|
173
|
+
"Port": ssh_port,
|
|
174
|
+
"User": container_user,
|
|
175
|
+
"IdentityFile": self.identity_file,
|
|
176
|
+
"IdentitiesOnly": "yes",
|
|
177
|
+
"StrictHostKeyChecking": "no",
|
|
178
|
+
"UserKnownHostsFile": "/dev/null",
|
|
179
|
+
"ProxyJump": f"{run_name}-jump-host",
|
|
180
|
+
}
|
|
181
|
+
else:
|
|
182
|
+
# Container-based backends
|
|
183
|
+
hosts[run_name] = {
|
|
184
|
+
"HostName": hostname,
|
|
185
|
+
"Port": ssh_port,
|
|
186
|
+
"User": container_user,
|
|
187
|
+
"IdentityFile": self.identity_file,
|
|
188
|
+
"IdentitiesOnly": "yes",
|
|
189
|
+
"StrictHostKeyChecking": "no",
|
|
190
|
+
"UserKnownHostsFile": "/dev/null",
|
|
191
|
+
}
|
|
192
|
+
if get_ssh_client_info().supports_multiplexing:
|
|
193
|
+
hosts[run_name].update(
|
|
148
194
|
{
|
|
149
195
|
"ControlMaster": "auto",
|
|
150
196
|
"ControlPath": self.control_sock_path,
|
|
@@ -153,14 +199,8 @@ class SSHAttach:
|
|
|
153
199
|
|
|
154
200
|
def attach(self):
|
|
155
201
|
include_ssh_config(self.ssh_config_path)
|
|
156
|
-
|
|
157
|
-
update_ssh_config(self.ssh_config_path,
|
|
158
|
-
elif self.ssh_proxy is not None:
|
|
159
|
-
update_ssh_config(self.ssh_config_path, f"{self.run_name}-jump-host", self.host_config)
|
|
160
|
-
update_ssh_config(self.ssh_config_path, self.run_name, self.container_config)
|
|
161
|
-
else:
|
|
162
|
-
update_ssh_config(self.ssh_config_path, f"{self.run_name}-host", self.host_config)
|
|
163
|
-
update_ssh_config(self.ssh_config_path, self.run_name, self.container_config)
|
|
202
|
+
for host, options in self.hosts.items():
|
|
203
|
+
update_ssh_config(self.ssh_config_path, host, options)
|
|
164
204
|
|
|
165
205
|
max_retries = 10
|
|
166
206
|
self._ports_lock.release()
|
|
@@ -178,9 +218,8 @@ class SSHAttach:
|
|
|
178
218
|
|
|
179
219
|
def detach(self):
|
|
180
220
|
self.tunnel.close()
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
update_ssh_config(self.ssh_config_path, self.run_name, {})
|
|
221
|
+
for host in self.hosts:
|
|
222
|
+
update_ssh_config(self.ssh_config_path, host, {})
|
|
184
223
|
|
|
185
224
|
def __enter__(self):
|
|
186
225
|
self.attach()
|