dstack 0.18.40rc1__py3-none-any.whl → 0.18.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dstack/_internal/cli/commands/apply.py +8 -5
- dstack/_internal/cli/services/configurators/base.py +4 -2
- dstack/_internal/cli/services/configurators/fleet.py +21 -9
- dstack/_internal/cli/services/configurators/gateway.py +15 -0
- dstack/_internal/cli/services/configurators/run.py +6 -5
- dstack/_internal/cli/services/configurators/volume.py +15 -0
- dstack/_internal/cli/services/repos.py +3 -3
- dstack/_internal/cli/utils/fleet.py +44 -33
- dstack/_internal/cli/utils/run.py +27 -7
- dstack/_internal/cli/utils/volume.py +30 -9
- dstack/_internal/core/backends/aws/compute.py +94 -53
- dstack/_internal/core/backends/aws/resources.py +22 -12
- dstack/_internal/core/backends/azure/compute.py +2 -0
- dstack/_internal/core/backends/base/compute.py +20 -2
- dstack/_internal/core/backends/gcp/compute.py +32 -24
- dstack/_internal/core/backends/gcp/resources.py +0 -15
- dstack/_internal/core/backends/oci/compute.py +10 -5
- dstack/_internal/core/backends/oci/resources.py +23 -26
- dstack/_internal/core/backends/remote/provisioning.py +65 -27
- dstack/_internal/core/backends/runpod/compute.py +1 -0
- dstack/_internal/core/models/backends/azure.py +3 -1
- dstack/_internal/core/models/configurations.py +24 -1
- dstack/_internal/core/models/fleets.py +46 -0
- dstack/_internal/core/models/instances.py +5 -1
- dstack/_internal/core/models/pools.py +4 -1
- dstack/_internal/core/models/profiles.py +10 -4
- dstack/_internal/core/models/runs.py +23 -3
- dstack/_internal/core/models/volumes.py +26 -0
- dstack/_internal/core/services/ssh/attach.py +92 -53
- dstack/_internal/core/services/ssh/tunnel.py +58 -31
- dstack/_internal/proxy/gateway/routers/registry.py +2 -0
- dstack/_internal/proxy/gateway/schemas/registry.py +2 -0
- dstack/_internal/proxy/gateway/services/registry.py +4 -0
- dstack/_internal/proxy/lib/models.py +3 -0
- dstack/_internal/proxy/lib/services/service_connection.py +8 -1
- dstack/_internal/server/background/tasks/process_instances.py +73 -35
- dstack/_internal/server/background/tasks/process_metrics.py +9 -9
- dstack/_internal/server/background/tasks/process_running_jobs.py +77 -26
- dstack/_internal/server/background/tasks/process_runs.py +2 -12
- dstack/_internal/server/background/tasks/process_submitted_jobs.py +121 -49
- dstack/_internal/server/background/tasks/process_terminating_jobs.py +14 -3
- dstack/_internal/server/background/tasks/process_volumes.py +11 -1
- dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
- dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
- dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
- dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
- dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
- dstack/_internal/server/models.py +27 -23
- dstack/_internal/server/routers/runs.py +1 -0
- dstack/_internal/server/schemas/runner.py +1 -0
- dstack/_internal/server/services/backends/configurators/azure.py +34 -8
- dstack/_internal/server/services/config.py +9 -0
- dstack/_internal/server/services/fleets.py +32 -3
- dstack/_internal/server/services/gateways/client.py +9 -1
- dstack/_internal/server/services/jobs/__init__.py +217 -45
- dstack/_internal/server/services/jobs/configurators/base.py +47 -2
- dstack/_internal/server/services/offers.py +96 -10
- dstack/_internal/server/services/pools.py +98 -14
- dstack/_internal/server/services/proxy/repo.py +17 -3
- dstack/_internal/server/services/runner/client.py +9 -6
- dstack/_internal/server/services/runner/ssh.py +33 -5
- dstack/_internal/server/services/runs.py +48 -179
- dstack/_internal/server/services/services/__init__.py +9 -1
- dstack/_internal/server/services/volumes.py +68 -9
- dstack/_internal/server/statics/index.html +1 -1
- dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js → main-2ac66bfcbd2e39830b88.js} +30 -31
- dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js.map → main-2ac66bfcbd2e39830b88.js.map} +1 -1
- dstack/_internal/server/statics/{main-fc56d1f4af8e57522a1c.css → main-ad5150a441de98cd8987.css} +1 -1
- dstack/_internal/server/testing/common.py +130 -61
- dstack/_internal/utils/common.py +22 -8
- dstack/_internal/utils/env.py +14 -0
- dstack/_internal/utils/ssh.py +1 -1
- dstack/api/server/_fleets.py +25 -1
- dstack/api/server/_runs.py +23 -2
- dstack/api/server/_volumes.py +12 -1
- dstack/version.py +1 -1
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/METADATA +1 -1
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/RECORD +104 -93
- tests/_internal/cli/services/configurators/test_profile.py +3 -3
- tests/_internal/core/services/ssh/test_tunnel.py +56 -4
- tests/_internal/proxy/gateway/routers/test_registry.py +30 -7
- tests/_internal/server/background/tasks/test_process_instances.py +138 -20
- tests/_internal/server/background/tasks/test_process_metrics.py +12 -0
- tests/_internal/server/background/tasks/test_process_running_jobs.py +193 -0
- tests/_internal/server/background/tasks/test_process_runs.py +27 -3
- tests/_internal/server/background/tasks/test_process_submitted_jobs.py +53 -6
- tests/_internal/server/background/tasks/test_process_terminating_jobs.py +135 -17
- tests/_internal/server/routers/test_fleets.py +15 -2
- tests/_internal/server/routers/test_pools.py +6 -0
- tests/_internal/server/routers/test_runs.py +27 -0
- tests/_internal/server/routers/test_volumes.py +9 -2
- tests/_internal/server/services/jobs/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/__init__.py +0 -0
- tests/_internal/server/services/jobs/configurators/test_base.py +72 -0
- tests/_internal/server/services/runner/test_client.py +22 -3
- tests/_internal/server/services/test_offers.py +167 -0
- tests/_internal/server/services/test_pools.py +109 -1
- tests/_internal/server/services/test_runs.py +5 -41
- tests/_internal/utils/test_common.py +21 -0
- tests/_internal/utils/test_env.py +38 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/LICENSE.md +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/WHEEL +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/entry_points.txt +0 -0
- {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/top_level.txt +0 -0

dstack/_internal/core/backends/oci/resources.py

@@ -203,34 +203,29 @@ def check_availability_in_domain(
     return available
 
 
-def
+def check_availability_per_domain(
     shape_names: Iterable[str],
     shapes_quota: ShapesQuota,
     region: OCIRegionClient,
     compartment_id: str,
-) -> Set[str]:
-    """
-    Returns a subset of `shape_names` with only the shapes available in at least
-    one availability domain within `region`.
-    """
-
+) -> Dict[str, Set[str]]:
     all_shapes = set(shape_names)
-
+    available_shapes_per_domain = {}
 
     for availability_domain in region.availability_domains:
        shapes_to_check = {
            shape
-            for shape in all_shapes
+            for shape in all_shapes
            if shapes_quota.is_within_domain_quota(shape, availability_domain.name)
        }
-
+        available_shapes_per_domain[availability_domain.name] = check_availability_in_domain(
            shape_names=shapes_to_check,
            availability_domain_name=availability_domain.name,
            client=region.compute_client,
            compartment_id=compartment_id,
        )
 
-    return
+    return available_shapes_per_domain
 
 
 def get_shapes_availability(

@@ -239,12 +234,11 @@ def get_shapes_availability(
     regions: Mapping[str, OCIRegionClient],
     compartment_id: str,
     executor: Executor,
-) -> Dict[str,
+) -> Dict[str, Dict[str, List[str]]]:
     """
-    Returns
-
+    Returns availability domains where shapes are available as regions->shapes->availability_domains mapping.
+    Only shapes from `offers` are checked.
     """
-
     shape_names_per_region = {region: set() for region in regions}
     for offer in offers:
         if shapes_quota.is_within_region_quota(offer.instance.name, offer.region):

@@ -253,7 +247,7 @@ def get_shapes_availability(
     future_to_region_name = {}
     for region_name, shape_names in shape_names_per_region.items():
         future = executor.submit(
-
+            check_availability_per_domain,
             shape_names,
             shapes_quota,
             regions[region_name],

@@ -263,29 +257,32 @@
 
     result = {}
     for future in as_completed(future_to_region_name):
-
-
+        domains_to_shape_names = future.result()
+        shape_names_to_domains = {}
+        for domain, shape_names in domains_to_shape_names.items():
+            for shape_name in shape_names:
+                shape_names_to_domains.setdefault(shape_name, []).append(domain)
+        result[future_to_region_name[future]] = shape_names_to_domains
 
     return result
 
 
-def
+def get_available_domains(
     shape_name: str, shapes_quota: ShapesQuota, region: OCIRegionClient, compartment_id: str
-) ->
+) -> List[str]:
     """
-    Returns the
-    `shape_name` is available
-    `shapes_quota` in all domains.
+    Returns the names of all availability domains in `region` in which
+    `shape_name` is available and within `shapes_quota`.
     """
-
+    domains = []
     for domain in region.availability_domains:
         if shapes_quota.is_within_domain_quota(
             shape_name, domain.name
         ) and check_availability_in_domain(
             {shape_name}, domain.name, region.compute_client, compartment_id
         ):
-
-            return
+            domains.append(domain.name)
+    return domains
 
 
 def get_instance_vnic(
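
Illustrative note (not part of the diff): after this change, `get_shapes_availability` returns a nested regions -> shapes -> availability domains mapping instead of a flat result. A minimal sketch of consuming such a mapping; the region, shape, and domain names below are made up:

# Hypothetical result shape of get_shapes_availability(); all values are invented.
availability = {
    "us-ashburn-1": {
        "VM.GPU.A10.1": ["Uocm:US-ASHBURN-AD-1", "Uocm:US-ASHBURN-AD-2"],
        "VM.Standard3.Flex": ["Uocm:US-ASHBURN-AD-1"],
    },
}

def first_available_domain(region: str, shape: str) -> str | None:
    # Pick any availability domain where the shape is available in the region.
    domains = availability.get(region, {}).get(shape, [])
    return domains[0] if domains else None

print(first_available_domain("us-ashburn-1", "VM.GPU.A10.1"))  # Uocm:US-ASHBURN-AD-1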

dstack/_internal/core/backends/remote/provisioning.py

@@ -1,9 +1,9 @@
 import io
 import json
 import time
-from contextlib import contextmanager
+from contextlib import contextmanager, nullcontext
 from textwrap import dedent
-from typing import Any, Dict, Generator, List
+from typing import Any, Dict, Generator, List, Optional
 
 import paramiko
 from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib

@@ -17,6 +17,7 @@ from dstack._internal.core.models.instances import (
     Gpu,
     InstanceType,
     Resources,
+    SSHConnectionParams,
 )
 from dstack._internal.utils.gpu import (
     convert_amd_gpu_name,

@@ -262,35 +263,72 @@ def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
 
 @contextmanager
 def get_paramiko_connection(
-    ssh_user: str,
+    ssh_user: str,
+    host: str,
+    port: int,
+    pkeys: List[paramiko.PKey],
+    proxy: Optional[SSHConnectionParams] = None,
+    proxy_pkeys: Optional[list[paramiko.PKey]] = None,
 ) -> Generator[paramiko.SSHClient, None, None]:
-
-
-
-
+    if proxy is not None:
+        if proxy_pkeys is None:
+            raise ProvisioningError("Missing proxy private keys")
+        proxy_ctx = get_paramiko_connection(
+            proxy.username, proxy.hostname, proxy.port, proxy_pkeys
+        )
+    else:
+        proxy_ctx = nullcontext()
+    conn_url = f"{ssh_user}@{host}:{port}"
+    with proxy_ctx as proxy_client, paramiko.SSHClient() as client:
+        proxy_channel: Optional[paramiko.Channel] = None
+        if proxy_client is not None:
            try:
-
-
-                username=ssh_user,
-                hostname=host,
-                port=port,
-                pkey=pkey,
-                look_for_keys=False,
-                allow_agent=False,
-                timeout=SSH_CONNECT_TIMEOUT,
+                proxy_channel = proxy_client.get_transport().open_channel(
+                    "direct-tcpip", (host, port), ("", 0)
                )
-            except paramiko.AuthenticationException:
-                logger.debug(
-                    f'Authentication failed to connect to "{conn_url}" and {pkey.fingerprint}'
-                )
-                continue  # try next key
            except (paramiko.SSHException, OSError) as e:
-                raise ProvisioningError(f"
-
+                raise ProvisioningError(f"Proxy channel failed: {e}") from e
+        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        for pkey in pkeys:
+            logger.debug("Try to connect to %s with key %s", conn_url, pkey.fingerprint)
+            connected = _paramiko_connect(client, ssh_user, host, port, pkey, proxy_channel)
+            if connected:
                yield client
                return
-
-
-        raise ProvisioningError(
-            f"SSH connection to the {conn_url} with keys [{keys_fp}] was unsuccessful"
+            logger.debug(
+                f'Authentication failed to connect to "{conn_url}" and {pkey.fingerprint}'
            )
+        keys_fp = ", ".join(f"{pk.fingerprint!r}" for pk in pkeys)
+        raise ProvisioningError(
+            f"SSH connection to the {conn_url} with keys [{keys_fp}] was unsuccessful"
+        )
+
+
+def _paramiko_connect(
+    client: paramiko.SSHClient,
+    user: str,
+    host: str,
+    port: int,
+    pkey: paramiko.PKey,
+    channel: Optional[paramiko.Channel] = None,
+) -> bool:
+    """
+    Returns `True` if connected, `False` if auth failed, and raises `ProvisioningError`
+    on other errors.
+    """
+    try:
+        client.connect(
+            username=user,
+            hostname=host,
+            port=port,
+            pkey=pkey,
+            look_for_keys=False,
+            allow_agent=False,
+            timeout=SSH_CONNECT_TIMEOUT,
+            sock=channel,
+        )
+        return True
+    except paramiko.AuthenticationException:
+        return False
+    except (paramiko.SSHException, OSError) as e:
+        raise ProvisioningError(f"Connect failed: {e}") from e
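
Illustrative usage (not part of the diff): the reworked `get_paramiko_connection` accepts an optional jump host. A minimal sketch, assuming `SSHConnectionParams` carries the `hostname`, `username`, and `port` attributes used above; the hosts, users, and key paths are made up:

import paramiko

from dstack._internal.core.backends.remote.provisioning import get_paramiko_connection
from dstack._internal.core.models.instances import SSHConnectionParams

# Made-up proxy and target hosts; private keys are loaded from hypothetical paths.
proxy = SSHConnectionParams(hostname="203.0.113.10", username="ubuntu", port=22)
proxy_pkeys = [paramiko.Ed25519Key.from_private_key_file("/path/to/proxy_key")]
pkeys = [paramiko.Ed25519Key.from_private_key_file("/path/to/host_key")]

with get_paramiko_connection(
    "ubuntu", "10.0.0.5", 22, pkeys, proxy=proxy, proxy_pkeys=proxy_pkeys
) as client:
    # Traffic to 10.0.0.5:22 is tunneled through the "direct-tcpip" channel
    # opened on the proxy connection, as implemented in the diff above.
    _, stdout, _ = client.exec_command("hostname")
    print(stdout.read().decode())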

dstack/_internal/core/models/backends/azure.py

@@ -11,6 +11,7 @@ class AzureConfigInfo(CoreModel):
     type: Literal["azure"] = "azure"
     tenant_id: str
     subscription_id: str
+    resource_group: Optional[str] = None
     locations: Optional[List[str]] = None
     vpc_ids: Optional[Dict[str, str]] = None
     public_ips: Optional[bool] = None

@@ -48,6 +49,7 @@ class AzureConfigInfoWithCredsPartial(CoreModel):
     creds: Optional[AnyAzureCreds]
     tenant_id: Optional[str]
     subscription_id: Optional[str]
+    resource_group: Optional[str]
     locations: Optional[List[str]]
     vpc_ids: Optional[Dict[str, str]]
     public_ips: Optional[bool]

@@ -63,4 +65,4 @@ class AzureConfigValues(CoreModel):
 
 
 class AzureStoredConfig(AzureConfigInfo):
-    resource_group: str
+    resource_group: str = ""

dstack/_internal/core/models/configurations.py

@@ -10,7 +10,7 @@ from dstack._internal.core.models.common import CoreModel, Duration, RegistryAut
 from dstack._internal.core.models.envs import Env
 from dstack._internal.core.models.fleets import FleetConfiguration
 from dstack._internal.core.models.gateways import GatewayConfiguration
-from dstack._internal.core.models.profiles import ProfileParams
+from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
 from dstack._internal.core.models.repos.base import Repo
 from dstack._internal.core.models.repos.virtual import VirtualRepo
 from dstack._internal.core.models.resources import Range, ResourcesSpec

@@ -212,6 +212,29 @@ class DevEnvironmentConfigurationParams(CoreModel):
     ide: Annotated[Literal["vscode"], Field(description="The IDE to run")]
     version: Annotated[Optional[str], Field(description="The version of the IDE")]
     init: Annotated[CommandsList, Field(description="The bash commands to run on startup")] = []
+    inactivity_duration: Annotated[
+        Optional[Union[Literal["off"], int, bool, str]],
+        Field(
+            description=(
+                "The maximum amount of time the dev environment can be inactive"
+                " (e.g., `2h`, `1d`, etc)."
+                " After it elapses, the dev environment is automatically stopped."
+                " Inactivity is defined as the absence of SSH connections to the"
+                " dev environment, including VS Code connections, `ssh <run name>`"
+                " shells, and attached `dstack apply` or `dstack attach` commands."
+                " Use `off` for unlimited duration. Defaults to `off`"
+            )
+        ),
+    ]
+
+    @validator("inactivity_duration", pre=True, allow_reuse=True)
+    def parse_inactivity_duration(
+        cls, v: Optional[Union[Literal["off"], int, bool, str]]
+    ) -> Optional[int]:
+        v = parse_off_duration(v)
+        if isinstance(v, int):
+            return v
+        return None
 
 
 class DevEnvironmentConfiguration(
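
Illustrative note (not part of the diff): the `inactivity_duration` validator above normalizes the raw value via `parse_off_duration` and keeps only integer results. A minimal sketch of the expected normalization; the conversion of strings like `2h` to seconds via `Duration.parse` is an assumption, not shown in this diff:

from dstack._internal.core.models.profiles import parse_off_duration

print(parse_off_duration("off"))  # "off" -> the validator then stores None (no limit)
print(parse_off_duration(False))  # False is treated the same as "off"
print(parse_off_duration("2h"))   # assumed to yield 7200 (seconds)
print(parse_off_duration(900))    # plain ints are assumed to pass through as seconds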

dstack/_internal/core/models/fleets.py

@@ -39,6 +39,14 @@ class InstanceGroupPlacement(str, Enum):
     CLUSTER = "cluster"
 
 
+class SSHProxyParams(CoreModel):
+    hostname: Annotated[str, Field(description="The IP address or domain of proxy host")]
+    port: Annotated[Optional[int], Field(description="The SSH port of proxy host")] = None
+    user: Annotated[str, Field(description="The user to log in with for proxy host")]
+    identity_file: Annotated[str, Field(description="The private key to use for proxy host")]
+    ssh_key: Optional[SSHKey] = None
+
+
 class SSHHostParams(CoreModel):
     hostname: Annotated[str, Field(description="The IP address or domain to connect to")]
     port: Annotated[

@@ -50,6 +58,9 @@ class SSHHostParams(CoreModel):
     identity_file: Annotated[
         Optional[str], Field(description="The private key to use for this host")
     ] = None
+    proxy_jump: Annotated[
+        Optional[SSHProxyParams], Field(description="The SSH proxy configuration for this host")
+    ] = None
     internal_ip: Annotated[
         Optional[str],
         Field(

@@ -61,6 +72,19 @@ class SSHHostParams(CoreModel):
     ] = None
     ssh_key: Optional[SSHKey] = None
 
+    blocks: Annotated[
+        Union[Literal["auto"], int],
+        Field(
+            description=(
+                "The amount of blocks to split the instance into, a number or `auto`."
+                " `auto` means as many as possible."
+                " The number of GPUs and CPUs must be divisible by the number of blocks."
+                " Defaults to `1`, i.e. do not split"
+            ),
+            ge=1,
+        ),
+    ] = 1
+
     @validator("internal_ip")
     def validate_internal_ip(cls, value):
         if value is None:

@@ -83,6 +107,9 @@ class SSHParams(CoreModel):
         Optional[str], Field(description="The private key to use for all hosts")
     ] = None
     ssh_key: Optional[SSHKey] = None
+    proxy_jump: Annotated[
+        Optional[SSHProxyParams], Field(description="The SSH proxy configuration for all hosts")
+    ] = None
     hosts: Annotated[
         List[Union[SSHHostParams, str]],
         Field(

@@ -142,6 +169,19 @@ class InstanceGroupParams(CoreModel):
         Field(description="The resources requirements"),
     ] = ResourcesSpec()
 
+    blocks: Annotated[
+        Union[Literal["auto"], int],
+        Field(
+            description=(
+                "The amount of blocks to split the instance into, a number or `auto`."
+                " `auto` means as many as possible."
+                " The number of GPUs and CPUs must be divisible by the number of blocks."
+                " Defaults to `1`, i.e. do not split"
+            ),
+            ge=1,
+        ),
+    ] = 1
+
     backends: Annotated[
         Optional[List[BackendType]],
         Field(description="The backends to consider for provisioning (e.g., `[aws, gcp]`)"),

@@ -152,6 +192,12 @@ class InstanceGroupParams(CoreModel):
             description="The regions to consider for provisioning (e.g., `[eu-west-1, us-west4, westeurope]`)"
         ),
     ] = None
+    availability_zones: Annotated[
+        Optional[List[str]],
+        Field(
+            description="The availability zones to consider for provisioning (e.g., `[eu-west-1a, us-west4-a]`)"
+        ),
+    ] = None
     instance_types: Annotated[
         Optional[List[str]],
         Field(
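
Illustrative note (not part of the diff): the new `blocks` option splits an instance into equal slices, and the field description above requires the GPU and CPU counts to be divisible by the block count. A minimal sketch of that arithmetic with made-up numbers:

def split_into_blocks(gpus: int, cpus: int, blocks: int) -> tuple[int, int]:
    # Per the field description, GPUs and CPUs must divide evenly across blocks.
    if gpus % blocks != 0 or cpus % blocks != 0:
        raise ValueError("The number of GPUs and CPUs must be divisible by the number of blocks")
    return gpus // blocks, cpus // blocks

# An 8-GPU, 96-CPU host split into 4 blocks gives 2 GPUs and 24 CPUs per block.
print(split_into_blocks(gpus=8, cpus=96, blocks=4))  # (2, 24)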

dstack/_internal/core/models/instances.py

@@ -92,6 +92,8 @@ class RemoteConnectionInfo(CoreModel):
     port: int
     ssh_user: str
     ssh_keys: List[SSHKey]
+    ssh_proxy: Optional[SSHConnectionParams] = None
+    ssh_proxy_keys: Optional[list[SSHKey]] = None
     env: Env = Env()
 
 

@@ -101,7 +103,6 @@ class InstanceConfiguration(CoreModel):
     user: str # dstack user name
     ssh_keys: List[SSHKey]
     instance_id: Optional[str] = None
-    availability_zone: Optional[str] = None
     placement_group_name: Optional[str] = None
     reservation: Optional[str] = None
     volumes: Optional[List[Volume]] = None

@@ -140,7 +141,10 @@ class InstanceOffer(CoreModel):
 
 class InstanceOfferWithAvailability(InstanceOffer):
     availability: InstanceAvailability
+    availability_zones: Optional[List[str]] = None
     instance_runtime: InstanceRuntime = InstanceRuntime.SHIM
+    blocks: int = 1
+    total_blocks: int = 1
 
 
 class InstanceStatus(str, Enum):

dstack/_internal/core/models/pools.py

@@ -25,14 +25,17 @@ class Instance(CoreModel):
     fleet_name: Optional[str] = None
     instance_num: int
     pool_name: Optional[str] = None
-    job_name: Optional[str] = None
+    job_name: Optional[str] = None # deprecated, always None (instance can have more than one job)
     hostname: Optional[str] = None
     status: InstanceStatus
     unreachable: bool = False
     termination_reason: Optional[str] = None
     created: datetime.datetime
     region: Optional[str] = None
+    availability_zone: Optional[str] = None
     price: Optional[float] = None
+    total_blocks: Optional[int] = None
+    busy_blocks: int = 0
 
 
 class PoolInstances(CoreModel):
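
Illustrative note (not part of the diff): with the new `total_blocks` and `busy_blocks` fields, an instance can presumably host several jobs at once. A minimal sketch of how remaining capacity could be derived; this is an assumption for illustration, not the server's actual scheduling logic:

def free_blocks(total_blocks: int, busy_blocks: int) -> int:
    # Blocks still available for new jobs on the instance.
    return max(total_blocks - busy_blocks, 0)

print(free_blocks(total_blocks=4, busy_blocks=1))  # 3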

dstack/_internal/core/models/profiles.py

@@ -40,15 +40,15 @@ def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]:
     return Duration.parse(v)
 
 
-def parse_max_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int
+def parse_max_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
     return parse_off_duration(v)
 
 
-def parse_stop_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int
+def parse_stop_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
     return parse_off_duration(v)
 
 
-def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int
+def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
     if v == "off" or v is False:
         return "off"
     if v is True:

@@ -123,6 +123,12 @@ class ProfileParams(CoreModel):
             description="The regions to consider for provisioning (e.g., `[eu-west-1, us-west4, westeurope]`)"
         ),
     ]
+    availability_zones: Annotated[
+        Optional[List[str]],
+        Field(
+            description="The availability zones to consider for provisioning (e.g., `[eu-west-1a, us-west4-a]`)"
+        ),
+    ] = None
     instance_types: Annotated[
         Optional[List[str]],
         Field(

@@ -162,7 +168,7 @@ class ProfileParams(CoreModel):
         Optional[Union[Literal["off"], str, int, bool]],
         Field(
             description=(
-                "The maximum duration of a run
+                "The maximum duration of a run graceful stopping."
                 " After it elapses, the run is automatically forced stopped."
                 " This includes force detaching volumes used by the run."
                 " Use `off` for unlimited duration. Defaults to `5m`"

dstack/_internal/core/models/runs.py

@@ -27,6 +27,7 @@ from dstack._internal.core.models.profiles import (
 from dstack._internal.core.models.repos import AnyRunRepoData
 from dstack._internal.core.models.resources import Memory, ResourcesSpec
 from dstack._internal.core.models.unix import UnixUser
+from dstack._internal.core.models.volumes import MountPoint
 from dstack._internal.utils import common as common_utils
 from dstack._internal.utils.common import format_pretty_duration
 

@@ -112,6 +113,7 @@ class JobTerminationReason(str, Enum):
     DONE_BY_RUNNER = "done_by_runner"
     ABORTED_BY_USER = "aborted_by_user"
     TERMINATED_BY_SERVER = "terminated_by_server"
+    INACTIVITY_DURATION_EXCEEDED = "inactivity_duration_exceeded"
     # Set by the runner
     CONTAINER_EXITED_WITH_ERROR = "container_exited_with_error"
     PORTS_BINDING_FAILED = "ports_binding_failed"

@@ -132,6 +134,7 @@ class JobTerminationReason(str, Enum):
             self.DONE_BY_RUNNER: JobStatus.DONE,
             self.ABORTED_BY_USER: JobStatus.ABORTED,
             self.TERMINATED_BY_SERVER: JobStatus.TERMINATED,
+            self.INACTIVITY_DURATION_EXCEEDED: JobStatus.TERMINATED,
             self.CONTAINER_EXITED_WITH_ERROR: JobStatus.FAILED,
             self.PORTS_BINDING_FAILED: JobStatus.FAILED,
             self.CREATING_CONTAINER_ERROR: JobStatus.FAILED,

@@ -147,9 +150,9 @@ class JobTerminationReason(str, Enum):
 class Requirements(CoreModel):
     # TODO: Make requirements' fields required
     resources: ResourcesSpec
-    max_price: Optional[float]
-    spot: Optional[bool]
-    reservation: Optional[str]
+    max_price: Optional[float] = None
+    spot: Optional[bool] = None
+    reservation: Optional[str] = None
 
     def pretty_format(self, resources_only: bool = False):
         res = self.resources.pretty_format()

@@ -190,6 +193,7 @@ class JobSpec(CoreModel):
     registry_auth: Optional[RegistryAuth]
     requirements: Requirements
     retry: Optional[Retry]
+    volumes: Optional[List[MountPoint]] = None
     # For backward compatibility with 0.18.x when retry_policy was required.
     # TODO: remove in 0.19
     retry_policy: ProfileRetryPolicy = ProfileRetryPolicy(retry=False)

@@ -231,6 +235,17 @@ class JobProvisioningData(CoreModel):
 
 
 class JobRuntimeData(CoreModel):
+    """
+    Holds various information only available after the job is submitted, such as:
+    * offer (depends on the instance)
+    * volumes used by the job
+    * resource constraints for container (depend on the instance)
+    * port mapping (reported by the shim only after the container is started)
+
+    Some fields are mutable, for example, `ports` only available when the shim starts
+    the container.
+    """
+
     network_mode: NetworkMode
     # GPU, CPU, memory resource shares. None means all available (no limit)
     gpu: Optional[int] = None

@@ -240,6 +255,10 @@ class JobRuntimeData(CoreModel):
     # None if data is not yet available (on vm-based backends and ssh instances)
     # or not applicable (container-based backends)
     ports: Optional[dict[int, int]] = None
+    # List of volumes used by the job
+    volume_names: Optional[list[str]] = None # None for backward compalibility
+    # Virtual shared offer
+    offer: Optional[InstanceOfferWithAvailability] = None # None for backward compalibility
 
 
 class ClusterInfo(CoreModel):

@@ -254,6 +273,7 @@ class JobSubmission(CoreModel):
     submitted_at: datetime
     last_processed_at: datetime
     finished_at: Optional[datetime]
+    inactivity_secs: Optional[int]
     status: JobStatus
     termination_reason: Optional[JobTerminationReason]
     termination_reason_message: Optional[str]

dstack/_internal/core/models/volumes.py

@@ -32,6 +32,9 @@ class VolumeConfiguration(CoreModel):
     name: Annotated[Optional[str], Field(description="The volume name")] = None
     backend: Annotated[BackendType, Field(description="The volume backend")]
     region: Annotated[str, Field(description="The volume region")]
+    availability_zone: Annotated[
+        Optional[str], Field(description="The volume availability zone")
+    ] = None
     size: Annotated[
         Optional[Memory],
         Field(description="The volume size. Must be specified when creating new volumes"),

@@ -68,6 +71,18 @@ class VolumeAttachmentData(CoreModel):
     device_name: Optional[str] = None
 
 
+class VolumeInstance(CoreModel):
+    name: str
+    fleet_name: Optional[str] = None
+    instance_num: int
+    instance_id: Optional[str] = None
+
+
+class VolumeAttachment(CoreModel):
+    instance: VolumeInstance
+    attachment_data: Optional[VolumeAttachmentData] = None
+
+
 class Volume(CoreModel):
     id: uuid.UUID
     name: str

@@ -83,8 +98,19 @@ class Volume(CoreModel):
     deleted: bool
     volume_id: Optional[str] = None # id of the volume in the cloud
     provisioning_data: Optional[VolumeProvisioningData] = None
+    attachments: Optional[List[VolumeAttachment]] = None
+    # attachment_data is deprecated in favor of attachments.
+    # It's only set for volumes that were attached before attachments.
     attachment_data: Optional[VolumeAttachmentData] = None
 
+    def get_attachment_data_for_instance(self, instance_id: str) -> Optional[VolumeAttachmentData]:
+        if self.attachments is not None:
+            for attachment in self.attachments:
+                if attachment.instance.instance_id == instance_id:
+                    return attachment.attachment_data
+        # volume was attached before attachments were introduced
+        return self.attachment_data
+
 
 class VolumePlan(CoreModel):
     project_name: