dstack 0.18.40__py3-none-any.whl → 0.18.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. dstack/_internal/cli/commands/apply.py +8 -5
  2. dstack/_internal/cli/services/configurators/base.py +4 -2
  3. dstack/_internal/cli/services/configurators/fleet.py +21 -9
  4. dstack/_internal/cli/services/configurators/gateway.py +15 -0
  5. dstack/_internal/cli/services/configurators/run.py +6 -5
  6. dstack/_internal/cli/services/configurators/volume.py +15 -0
  7. dstack/_internal/cli/services/repos.py +3 -3
  8. dstack/_internal/cli/utils/fleet.py +44 -33
  9. dstack/_internal/cli/utils/run.py +27 -7
  10. dstack/_internal/cli/utils/volume.py +21 -9
  11. dstack/_internal/core/backends/aws/compute.py +92 -52
  12. dstack/_internal/core/backends/aws/resources.py +22 -12
  13. dstack/_internal/core/backends/azure/compute.py +2 -0
  14. dstack/_internal/core/backends/base/compute.py +20 -2
  15. dstack/_internal/core/backends/gcp/compute.py +30 -23
  16. dstack/_internal/core/backends/gcp/resources.py +0 -15
  17. dstack/_internal/core/backends/oci/compute.py +10 -5
  18. dstack/_internal/core/backends/oci/resources.py +23 -26
  19. dstack/_internal/core/backends/remote/provisioning.py +65 -27
  20. dstack/_internal/core/backends/runpod/compute.py +1 -0
  21. dstack/_internal/core/models/backends/azure.py +3 -1
  22. dstack/_internal/core/models/configurations.py +24 -1
  23. dstack/_internal/core/models/fleets.py +46 -0
  24. dstack/_internal/core/models/instances.py +5 -1
  25. dstack/_internal/core/models/pools.py +4 -1
  26. dstack/_internal/core/models/profiles.py +10 -4
  27. dstack/_internal/core/models/runs.py +20 -0
  28. dstack/_internal/core/models/volumes.py +3 -0
  29. dstack/_internal/core/services/ssh/attach.py +92 -53
  30. dstack/_internal/core/services/ssh/tunnel.py +58 -31
  31. dstack/_internal/proxy/gateway/routers/registry.py +2 -0
  32. dstack/_internal/proxy/gateway/schemas/registry.py +2 -0
  33. dstack/_internal/proxy/gateway/services/registry.py +4 -0
  34. dstack/_internal/proxy/lib/models.py +3 -0
  35. dstack/_internal/proxy/lib/services/service_connection.py +8 -1
  36. dstack/_internal/server/background/tasks/process_instances.py +72 -33
  37. dstack/_internal/server/background/tasks/process_metrics.py +9 -9
  38. dstack/_internal/server/background/tasks/process_running_jobs.py +73 -26
  39. dstack/_internal/server/background/tasks/process_runs.py +2 -12
  40. dstack/_internal/server/background/tasks/process_submitted_jobs.py +109 -42
  41. dstack/_internal/server/background/tasks/process_terminating_jobs.py +1 -1
  42. dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
  43. dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
  44. dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
  45. dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
  46. dstack/_internal/server/models.py +10 -4
  47. dstack/_internal/server/routers/runs.py +1 -0
  48. dstack/_internal/server/schemas/runner.py +1 -0
  49. dstack/_internal/server/services/backends/configurators/azure.py +34 -8
  50. dstack/_internal/server/services/config.py +9 -0
  51. dstack/_internal/server/services/fleets.py +27 -2
  52. dstack/_internal/server/services/gateways/client.py +9 -1
  53. dstack/_internal/server/services/jobs/__init__.py +215 -43
  54. dstack/_internal/server/services/jobs/configurators/base.py +47 -2
  55. dstack/_internal/server/services/offers.py +91 -5
  56. dstack/_internal/server/services/pools.py +95 -11
  57. dstack/_internal/server/services/proxy/repo.py +17 -3
  58. dstack/_internal/server/services/runner/client.py +1 -1
  59. dstack/_internal/server/services/runner/ssh.py +33 -5
  60. dstack/_internal/server/services/runs.py +48 -179
  61. dstack/_internal/server/services/services/__init__.py +9 -1
  62. dstack/_internal/server/statics/index.html +1 -1
  63. dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js → main-2ac66bfcbd2e39830b88.js} +30 -31
  64. dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js.map → main-2ac66bfcbd2e39830b88.js.map} +1 -1
  65. dstack/_internal/server/statics/{main-fc56d1f4af8e57522a1c.css → main-ad5150a441de98cd8987.css} +1 -1
  66. dstack/_internal/server/testing/common.py +117 -52
  67. dstack/_internal/utils/common.py +22 -8
  68. dstack/_internal/utils/env.py +14 -0
  69. dstack/_internal/utils/ssh.py +1 -1
  70. dstack/api/server/_fleets.py +25 -1
  71. dstack/api/server/_runs.py +23 -2
  72. dstack/api/server/_volumes.py +12 -1
  73. dstack/version.py +1 -1
  74. {dstack-0.18.40.dist-info → dstack-0.18.41.dist-info}/METADATA +1 -1
  75. {dstack-0.18.40.dist-info → dstack-0.18.41.dist-info}/RECORD +98 -89
  76. tests/_internal/cli/services/configurators/test_profile.py +3 -3
  77. tests/_internal/core/services/ssh/test_tunnel.py +56 -4
  78. tests/_internal/proxy/gateway/routers/test_registry.py +30 -7
  79. tests/_internal/server/background/tasks/test_process_instances.py +138 -20
  80. tests/_internal/server/background/tasks/test_process_metrics.py +12 -0
  81. tests/_internal/server/background/tasks/test_process_running_jobs.py +192 -0
  82. tests/_internal/server/background/tasks/test_process_runs.py +27 -3
  83. tests/_internal/server/background/tasks/test_process_submitted_jobs.py +48 -3
  84. tests/_internal/server/background/tasks/test_process_terminating_jobs.py +126 -13
  85. tests/_internal/server/routers/test_fleets.py +15 -2
  86. tests/_internal/server/routers/test_pools.py +6 -0
  87. tests/_internal/server/routers/test_runs.py +27 -0
  88. tests/_internal/server/services/jobs/__init__.py +0 -0
  89. tests/_internal/server/services/jobs/configurators/__init__.py +0 -0
  90. tests/_internal/server/services/jobs/configurators/test_base.py +72 -0
  91. tests/_internal/server/services/test_pools.py +4 -0
  92. tests/_internal/server/services/test_runs.py +5 -41
  93. tests/_internal/utils/test_common.py +21 -0
  94. tests/_internal/utils/test_env.py +38 -0
  95. {dstack-0.18.40.dist-info → dstack-0.18.41.dist-info}/LICENSE.md +0 -0
  96. {dstack-0.18.40.dist-info → dstack-0.18.41.dist-info}/WHEEL +0 -0
  97. {dstack-0.18.40.dist-info → dstack-0.18.41.dist-info}/entry_points.txt +0 -0
  98. {dstack-0.18.40.dist-info → dstack-0.18.41.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,9 @@
1
1
  import io
2
2
  import json
3
3
  import time
4
- from contextlib import contextmanager
4
+ from contextlib import contextmanager, nullcontext
5
5
  from textwrap import dedent
6
- from typing import Any, Dict, Generator, List
6
+ from typing import Any, Dict, Generator, List, Optional
7
7
 
8
8
  import paramiko
9
9
  from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib
@@ -17,6 +17,7 @@ from dstack._internal.core.models.instances import (
17
17
  Gpu,
18
18
  InstanceType,
19
19
  Resources,
20
+ SSHConnectionParams,
20
21
  )
21
22
  from dstack._internal.utils.gpu import (
22
23
  convert_amd_gpu_name,
@@ -262,35 +263,72 @@ def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
262
263
 
263
264
  @contextmanager
264
265
  def get_paramiko_connection(
265
- ssh_user: str, host: str, port: int, pkeys: List[paramiko.PKey]
266
+ ssh_user: str,
267
+ host: str,
268
+ port: int,
269
+ pkeys: List[paramiko.PKey],
270
+ proxy: Optional[SSHConnectionParams] = None,
271
+ proxy_pkeys: Optional[list[paramiko.PKey]] = None,
266
272
  ) -> Generator[paramiko.SSHClient, None, None]:
267
- with paramiko.SSHClient() as client:
268
- client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
269
- for pkey in pkeys:
270
- conn_url = f"{ssh_user}@{host}:{port}"
273
+ if proxy is not None:
274
+ if proxy_pkeys is None:
275
+ raise ProvisioningError("Missing proxy private keys")
276
+ proxy_ctx = get_paramiko_connection(
277
+ proxy.username, proxy.hostname, proxy.port, proxy_pkeys
278
+ )
279
+ else:
280
+ proxy_ctx = nullcontext()
281
+ conn_url = f"{ssh_user}@{host}:{port}"
282
+ with proxy_ctx as proxy_client, paramiko.SSHClient() as client:
283
+ proxy_channel: Optional[paramiko.Channel] = None
284
+ if proxy_client is not None:
271
285
  try:
272
- logger.debug("Try to connect to %s with key %s", conn_url, pkey.fingerprint)
273
- client.connect(
274
- username=ssh_user,
275
- hostname=host,
276
- port=port,
277
- pkey=pkey,
278
- look_for_keys=False,
279
- allow_agent=False,
280
- timeout=SSH_CONNECT_TIMEOUT,
286
+ proxy_channel = proxy_client.get_transport().open_channel(
287
+ "direct-tcpip", (host, port), ("", 0)
281
288
  )
282
- except paramiko.AuthenticationException:
283
- logger.debug(
284
- f'Authentication failed to connect to "{conn_url}" and {pkey.fingerprint}'
285
- )
286
- continue # try next key
287
289
  except (paramiko.SSHException, OSError) as e:
288
- raise ProvisioningError(f"Connect failed: {e}") from e
289
- else:
290
+ raise ProvisioningError(f"Proxy channel failed: {e}") from e
291
+ client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
292
+ for pkey in pkeys:
293
+ logger.debug("Try to connect to %s with key %s", conn_url, pkey.fingerprint)
294
+ connected = _paramiko_connect(client, ssh_user, host, port, pkey, proxy_channel)
295
+ if connected:
290
296
  yield client
291
297
  return
292
- else:
293
- keys_fp = ", ".join(f"{pk.fingerprint!r}" for pk in pkeys)
294
- raise ProvisioningError(
295
- f"SSH connection to the {conn_url} with keys [{keys_fp}] was unsuccessful"
298
+ logger.debug(
299
+ f'Authentication failed to connect to "{conn_url}" and {pkey.fingerprint}'
296
300
  )
301
+ keys_fp = ", ".join(f"{pk.fingerprint!r}" for pk in pkeys)
302
+ raise ProvisioningError(
303
+ f"SSH connection to the {conn_url} with keys [{keys_fp}] was unsuccessful"
304
+ )
305
+
306
+
307
+ def _paramiko_connect(
308
+ client: paramiko.SSHClient,
309
+ user: str,
310
+ host: str,
311
+ port: int,
312
+ pkey: paramiko.PKey,
313
+ channel: Optional[paramiko.Channel] = None,
314
+ ) -> bool:
315
+ """
316
+ Returns `True` if connected, `False` if auth failed, and raises `ProvisioningError`
317
+ on other errors.
318
+ """
319
+ try:
320
+ client.connect(
321
+ username=user,
322
+ hostname=host,
323
+ port=port,
324
+ pkey=pkey,
325
+ look_for_keys=False,
326
+ allow_agent=False,
327
+ timeout=SSH_CONNECT_TIMEOUT,
328
+ sock=channel,
329
+ )
330
+ return True
331
+ except paramiko.AuthenticationException:
332
+ return False
333
+ except (paramiko.SSHException, OSError) as e:
334
+ raise ProvisioningError(f"Connect failed: {e}") from e
@@ -113,6 +113,7 @@ class RunpodCompute(Compute):
113
113
  bid_per_gpu=bid_per_gpu,
114
114
  network_volume_id=network_volume_id,
115
115
  volume_mount_path=volume_mount_path,
116
+ env={"RUNPOD_POD_USER": "0"},
116
117
  )
117
118
 
118
119
  instance_id = resp["id"]
@@ -11,6 +11,7 @@ class AzureConfigInfo(CoreModel):
11
11
  type: Literal["azure"] = "azure"
12
12
  tenant_id: str
13
13
  subscription_id: str
14
+ resource_group: Optional[str] = None
14
15
  locations: Optional[List[str]] = None
15
16
  vpc_ids: Optional[Dict[str, str]] = None
16
17
  public_ips: Optional[bool] = None
@@ -48,6 +49,7 @@ class AzureConfigInfoWithCredsPartial(CoreModel):
48
49
  creds: Optional[AnyAzureCreds]
49
50
  tenant_id: Optional[str]
50
51
  subscription_id: Optional[str]
52
+ resource_group: Optional[str]
51
53
  locations: Optional[List[str]]
52
54
  vpc_ids: Optional[Dict[str, str]]
53
55
  public_ips: Optional[bool]
@@ -63,4 +65,4 @@ class AzureConfigValues(CoreModel):
63
65
 
64
66
 
65
67
  class AzureStoredConfig(AzureConfigInfo):
66
- resource_group: str
68
+ resource_group: str = ""
@@ -10,7 +10,7 @@ from dstack._internal.core.models.common import CoreModel, Duration, RegistryAut
10
10
  from dstack._internal.core.models.envs import Env
11
11
  from dstack._internal.core.models.fleets import FleetConfiguration
12
12
  from dstack._internal.core.models.gateways import GatewayConfiguration
13
- from dstack._internal.core.models.profiles import ProfileParams
13
+ from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
14
14
  from dstack._internal.core.models.repos.base import Repo
15
15
  from dstack._internal.core.models.repos.virtual import VirtualRepo
16
16
  from dstack._internal.core.models.resources import Range, ResourcesSpec
@@ -212,6 +212,29 @@ class DevEnvironmentConfigurationParams(CoreModel):
212
212
  ide: Annotated[Literal["vscode"], Field(description="The IDE to run")]
213
213
  version: Annotated[Optional[str], Field(description="The version of the IDE")]
214
214
  init: Annotated[CommandsList, Field(description="The bash commands to run on startup")] = []
215
+ inactivity_duration: Annotated[
216
+ Optional[Union[Literal["off"], int, bool, str]],
217
+ Field(
218
+ description=(
219
+ "The maximum amount of time the dev environment can be inactive"
220
+ " (e.g., `2h`, `1d`, etc)."
221
+ " After it elapses, the dev environment is automatically stopped."
222
+ " Inactivity is defined as the absence of SSH connections to the"
223
+ " dev environment, including VS Code connections, `ssh <run name>`"
224
+ " shells, and attached `dstack apply` or `dstack attach` commands."
225
+ " Use `off` for unlimited duration. Defaults to `off`"
226
+ )
227
+ ),
228
+ ]
229
+
230
+ @validator("inactivity_duration", pre=True, allow_reuse=True)
231
+ def parse_inactivity_duration(
232
+ cls, v: Optional[Union[Literal["off"], int, bool, str]]
233
+ ) -> Optional[int]:
234
+ v = parse_off_duration(v)
235
+ if isinstance(v, int):
236
+ return v
237
+ return None
215
238
 
216
239
 
217
240
  class DevEnvironmentConfiguration(
@@ -39,6 +39,14 @@ class InstanceGroupPlacement(str, Enum):
39
39
  CLUSTER = "cluster"
40
40
 
41
41
 
42
+ class SSHProxyParams(CoreModel):
43
+ hostname: Annotated[str, Field(description="The IP address or domain of proxy host")]
44
+ port: Annotated[Optional[int], Field(description="The SSH port of proxy host")] = None
45
+ user: Annotated[str, Field(description="The user to log in with for proxy host")]
46
+ identity_file: Annotated[str, Field(description="The private key to use for proxy host")]
47
+ ssh_key: Optional[SSHKey] = None
48
+
49
+
42
50
  class SSHHostParams(CoreModel):
43
51
  hostname: Annotated[str, Field(description="The IP address or domain to connect to")]
44
52
  port: Annotated[
@@ -50,6 +58,9 @@ class SSHHostParams(CoreModel):
50
58
  identity_file: Annotated[
51
59
  Optional[str], Field(description="The private key to use for this host")
52
60
  ] = None
61
+ proxy_jump: Annotated[
62
+ Optional[SSHProxyParams], Field(description="The SSH proxy configuration for this host")
63
+ ] = None
53
64
  internal_ip: Annotated[
54
65
  Optional[str],
55
66
  Field(
@@ -61,6 +72,19 @@ class SSHHostParams(CoreModel):
61
72
  ] = None
62
73
  ssh_key: Optional[SSHKey] = None
63
74
 
75
+ blocks: Annotated[
76
+ Union[Literal["auto"], int],
77
+ Field(
78
+ description=(
79
+ "The amount of blocks to split the instance into, a number or `auto`."
80
+ " `auto` means as many as possible."
81
+ " The number of GPUs and CPUs must be divisible by the number of blocks."
82
+ " Defaults to `1`, i.e. do not split"
83
+ ),
84
+ ge=1,
85
+ ),
86
+ ] = 1
87
+
64
88
  @validator("internal_ip")
65
89
  def validate_internal_ip(cls, value):
66
90
  if value is None:
@@ -83,6 +107,9 @@ class SSHParams(CoreModel):
83
107
  Optional[str], Field(description="The private key to use for all hosts")
84
108
  ] = None
85
109
  ssh_key: Optional[SSHKey] = None
110
+ proxy_jump: Annotated[
111
+ Optional[SSHProxyParams], Field(description="The SSH proxy configuration for all hosts")
112
+ ] = None
86
113
  hosts: Annotated[
87
114
  List[Union[SSHHostParams, str]],
88
115
  Field(
@@ -142,6 +169,19 @@ class InstanceGroupParams(CoreModel):
142
169
  Field(description="The resources requirements"),
143
170
  ] = ResourcesSpec()
144
171
 
172
+ blocks: Annotated[
173
+ Union[Literal["auto"], int],
174
+ Field(
175
+ description=(
176
+ "The amount of blocks to split the instance into, a number or `auto`."
177
+ " `auto` means as many as possible."
178
+ " The number of GPUs and CPUs must be divisible by the number of blocks."
179
+ " Defaults to `1`, i.e. do not split"
180
+ ),
181
+ ge=1,
182
+ ),
183
+ ] = 1
184
+
145
185
  backends: Annotated[
146
186
  Optional[List[BackendType]],
147
187
  Field(description="The backends to consider for provisioning (e.g., `[aws, gcp]`)"),
@@ -152,6 +192,12 @@ class InstanceGroupParams(CoreModel):
152
192
  description="The regions to consider for provisioning (e.g., `[eu-west-1, us-west4, westeurope]`)"
153
193
  ),
154
194
  ] = None
195
+ availability_zones: Annotated[
196
+ Optional[List[str]],
197
+ Field(
198
+ description="The availability zones to consider for provisioning (e.g., `[eu-west-1a, us-west4-a]`)"
199
+ ),
200
+ ] = None
155
201
  instance_types: Annotated[
156
202
  Optional[List[str]],
157
203
  Field(
@@ -92,6 +92,8 @@ class RemoteConnectionInfo(CoreModel):
92
92
  port: int
93
93
  ssh_user: str
94
94
  ssh_keys: List[SSHKey]
95
+ ssh_proxy: Optional[SSHConnectionParams] = None
96
+ ssh_proxy_keys: Optional[list[SSHKey]] = None
95
97
  env: Env = Env()
96
98
 
97
99
 
@@ -101,7 +103,6 @@ class InstanceConfiguration(CoreModel):
101
103
  user: str # dstack user name
102
104
  ssh_keys: List[SSHKey]
103
105
  instance_id: Optional[str] = None
104
- availability_zone: Optional[str] = None
105
106
  placement_group_name: Optional[str] = None
106
107
  reservation: Optional[str] = None
107
108
  volumes: Optional[List[Volume]] = None
@@ -140,7 +141,10 @@ class InstanceOffer(CoreModel):
140
141
 
141
142
  class InstanceOfferWithAvailability(InstanceOffer):
142
143
  availability: InstanceAvailability
144
+ availability_zones: Optional[List[str]] = None
143
145
  instance_runtime: InstanceRuntime = InstanceRuntime.SHIM
146
+ blocks: int = 1
147
+ total_blocks: int = 1
144
148
 
145
149
 
146
150
  class InstanceStatus(str, Enum):
@@ -25,14 +25,17 @@ class Instance(CoreModel):
25
25
  fleet_name: Optional[str] = None
26
26
  instance_num: int
27
27
  pool_name: Optional[str] = None
28
- job_name: Optional[str] = None
28
+ job_name: Optional[str] = None # deprecated, always None (instance can have more than one job)
29
29
  hostname: Optional[str] = None
30
30
  status: InstanceStatus
31
31
  unreachable: bool = False
32
32
  termination_reason: Optional[str] = None
33
33
  created: datetime.datetime
34
34
  region: Optional[str] = None
35
+ availability_zone: Optional[str] = None
35
36
  price: Optional[float] = None
37
+ total_blocks: Optional[int] = None
38
+ busy_blocks: int = 0
36
39
 
37
40
 
38
41
  class PoolInstances(CoreModel):
@@ -40,15 +40,15 @@ def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]:
40
40
  return Duration.parse(v)
41
41
 
42
42
 
43
- def parse_max_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int, bool]]:
43
+ def parse_max_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
44
44
  return parse_off_duration(v)
45
45
 
46
46
 
47
- def parse_stop_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int, bool]]:
47
+ def parse_stop_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
48
48
  return parse_off_duration(v)
49
49
 
50
50
 
51
- def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int, bool]]:
51
+ def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
52
52
  if v == "off" or v is False:
53
53
  return "off"
54
54
  if v is True:
@@ -123,6 +123,12 @@ class ProfileParams(CoreModel):
123
123
  description="The regions to consider for provisioning (e.g., `[eu-west-1, us-west4, westeurope]`)"
124
124
  ),
125
125
  ]
126
+ availability_zones: Annotated[
127
+ Optional[List[str]],
128
+ Field(
129
+ description="The availability zones to consider for provisioning (e.g., `[eu-west-1a, us-west4-a]`)"
130
+ ),
131
+ ] = None
126
132
  instance_types: Annotated[
127
133
  Optional[List[str]],
128
134
  Field(
@@ -162,7 +168,7 @@ class ProfileParams(CoreModel):
162
168
  Optional[Union[Literal["off"], str, int, bool]],
163
169
  Field(
164
170
  description=(
165
- "The maximum duration of a run gracefull stopping."
171
+ "The maximum duration of a run graceful stopping."
166
172
  " After it elapses, the run is automatically forced stopped."
167
173
  " This includes force detaching volumes used by the run."
168
174
  " Use `off` for unlimited duration. Defaults to `5m`"
@@ -27,6 +27,7 @@ from dstack._internal.core.models.profiles import (
27
27
  from dstack._internal.core.models.repos import AnyRunRepoData
28
28
  from dstack._internal.core.models.resources import Memory, ResourcesSpec
29
29
  from dstack._internal.core.models.unix import UnixUser
30
+ from dstack._internal.core.models.volumes import MountPoint
30
31
  from dstack._internal.utils import common as common_utils
31
32
  from dstack._internal.utils.common import format_pretty_duration
32
33
 
@@ -112,6 +113,7 @@ class JobTerminationReason(str, Enum):
112
113
  DONE_BY_RUNNER = "done_by_runner"
113
114
  ABORTED_BY_USER = "aborted_by_user"
114
115
  TERMINATED_BY_SERVER = "terminated_by_server"
116
+ INACTIVITY_DURATION_EXCEEDED = "inactivity_duration_exceeded"
115
117
  # Set by the runner
116
118
  CONTAINER_EXITED_WITH_ERROR = "container_exited_with_error"
117
119
  PORTS_BINDING_FAILED = "ports_binding_failed"
@@ -132,6 +134,7 @@ class JobTerminationReason(str, Enum):
132
134
  self.DONE_BY_RUNNER: JobStatus.DONE,
133
135
  self.ABORTED_BY_USER: JobStatus.ABORTED,
134
136
  self.TERMINATED_BY_SERVER: JobStatus.TERMINATED,
137
+ self.INACTIVITY_DURATION_EXCEEDED: JobStatus.TERMINATED,
135
138
  self.CONTAINER_EXITED_WITH_ERROR: JobStatus.FAILED,
136
139
  self.PORTS_BINDING_FAILED: JobStatus.FAILED,
137
140
  self.CREATING_CONTAINER_ERROR: JobStatus.FAILED,
@@ -190,6 +193,7 @@ class JobSpec(CoreModel):
190
193
  registry_auth: Optional[RegistryAuth]
191
194
  requirements: Requirements
192
195
  retry: Optional[Retry]
196
+ volumes: Optional[List[MountPoint]] = None
193
197
  # For backward compatibility with 0.18.x when retry_policy was required.
194
198
  # TODO: remove in 0.19
195
199
  retry_policy: ProfileRetryPolicy = ProfileRetryPolicy(retry=False)
@@ -231,6 +235,17 @@ class JobProvisioningData(CoreModel):
231
235
 
232
236
 
233
237
  class JobRuntimeData(CoreModel):
238
+ """
239
+ Holds various information only available after the job is submitted, such as:
240
+ * offer (depends on the instance)
241
+ * volumes used by the job
242
+ * resource constraints for container (depend on the instance)
243
+ * port mapping (reported by the shim only after the container is started)
244
+
245
+ Some fields are mutable, for example, `ports` only available when the shim starts
246
+ the container.
247
+ """
248
+
234
249
  network_mode: NetworkMode
235
250
  # GPU, CPU, memory resource shares. None means all available (no limit)
236
251
  gpu: Optional[int] = None
@@ -240,6 +255,10 @@ class JobRuntimeData(CoreModel):
240
255
  # None if data is not yet available (on vm-based backends and ssh instances)
241
256
  # or not applicable (container-based backends)
242
257
  ports: Optional[dict[int, int]] = None
258
+ # List of volumes used by the job
259
+ volume_names: Optional[list[str]] = None # None for backward compalibility
260
+ # Virtual shared offer
261
+ offer: Optional[InstanceOfferWithAvailability] = None # None for backward compalibility
243
262
 
244
263
 
245
264
  class ClusterInfo(CoreModel):
@@ -254,6 +273,7 @@ class JobSubmission(CoreModel):
254
273
  submitted_at: datetime
255
274
  last_processed_at: datetime
256
275
  finished_at: Optional[datetime]
276
+ inactivity_secs: Optional[int]
257
277
  status: JobStatus
258
278
  termination_reason: Optional[JobTerminationReason]
259
279
  termination_reason_message: Optional[str]
@@ -32,6 +32,9 @@ class VolumeConfiguration(CoreModel):
32
32
  name: Annotated[Optional[str], Field(description="The volume name")] = None
33
33
  backend: Annotated[BackendType, Field(description="The volume backend")]
34
34
  region: Annotated[str, Field(description="The volume region")]
35
+ availability_zone: Annotated[
36
+ Optional[str], Field(description="The volume availability zone")
37
+ ] = None
35
38
  size: Annotated[
36
39
  Optional[Memory],
37
40
  Field(description="The volume size. Must be specified when creating new volumes"),
@@ -2,7 +2,7 @@ import atexit
2
2
  import re
3
3
  import time
4
4
  from pathlib import Path
5
- from typing import Optional
5
+ from typing import Optional, Union
6
6
 
7
7
  import psutil
8
8
 
@@ -14,6 +14,8 @@ from dstack._internal.core.services.ssh.ports import PortsLock
14
14
  from dstack._internal.core.services.ssh.tunnel import SSHTunnel, ports_to_forwarded_sockets
15
15
  from dstack._internal.utils.path import FilePath, PathLike
16
16
  from dstack._internal.utils.ssh import (
17
+ default_ssh_config_path,
18
+ get_host_config,
17
19
  include_ssh_config,
18
20
  normalize_path,
19
21
  update_ssh_config,
@@ -88,28 +90,63 @@ class SSHAttach:
88
90
  },
89
91
  )
90
92
  self.ssh_proxy = ssh_proxy
91
- if ssh_proxy is None:
92
- self.host_config = {
93
+
94
+ hosts: dict[str, dict[str, Union[str, int, FilePath]]] = {}
95
+ self.hosts = hosts
96
+
97
+ if local_backend:
98
+ hosts[run_name] = {
93
99
  "HostName": hostname,
94
- "Port": ssh_port,
95
- "User": user if dockerized else container_user,
96
- "IdentityFile": self.identity_file,
97
- "IdentitiesOnly": "yes",
98
- "StrictHostKeyChecking": "no",
99
- "UserKnownHostsFile": "/dev/null",
100
- }
101
- else:
102
- self.host_config = {
103
- "HostName": ssh_proxy.hostname,
104
- "Port": ssh_proxy.port,
105
- "User": ssh_proxy.username,
100
+ "Port": container_ssh_port,
101
+ "User": container_user,
106
102
  "IdentityFile": self.identity_file,
107
103
  "IdentitiesOnly": "yes",
108
104
  "StrictHostKeyChecking": "no",
109
105
  "UserKnownHostsFile": "/dev/null",
110
106
  }
111
- if dockerized and not local_backend:
112
- self.container_config = {
107
+ elif dockerized:
108
+ if ssh_proxy is not None:
109
+ # SSH instance with jump host
110
+ # dstack has no IdentityFile for jump host, it must be either preconfigured
111
+ # in the ~/.ssh/config or loaded into ssh-agent
112
+ hosts[f"{run_name}-jump-host"] = {
113
+ "HostName": ssh_proxy.hostname,
114
+ "Port": ssh_proxy.port,
115
+ "User": ssh_proxy.username,
116
+ "StrictHostKeyChecking": "no",
117
+ "UserKnownHostsFile": "/dev/null",
118
+ }
119
+ jump_host_config = get_host_config(ssh_proxy.hostname, default_ssh_config_path)
120
+ jump_host_identity_files = jump_host_config.get("identityfile")
121
+ if jump_host_identity_files:
122
+ hosts[f"{run_name}-jump-host"].update(
123
+ {
124
+ "IdentityFile": jump_host_identity_files[0],
125
+ "IdentitiesOnly": "yes",
126
+ }
127
+ )
128
+ hosts[f"{run_name}-host"] = {
129
+ "HostName": hostname,
130
+ "Port": ssh_port,
131
+ "User": user,
132
+ "IdentityFile": self.identity_file,
133
+ "IdentitiesOnly": "yes",
134
+ "StrictHostKeyChecking": "no",
135
+ "UserKnownHostsFile": "/dev/null",
136
+ "ProxyJump": f"{run_name}-jump-host",
137
+ }
138
+ else:
139
+ # Regular SSH instance or VM-based cloud instance
140
+ hosts[f"{run_name}-host"] = {
141
+ "HostName": hostname,
142
+ "Port": ssh_port,
143
+ "User": user,
144
+ "IdentityFile": self.identity_file,
145
+ "IdentitiesOnly": "yes",
146
+ "StrictHostKeyChecking": "no",
147
+ "UserKnownHostsFile": "/dev/null",
148
+ }
149
+ hosts[run_name] = {
113
150
  "HostName": "localhost",
114
151
  "Port": container_ssh_port,
115
152
  "User": container_user,
@@ -119,32 +156,41 @@ class SSHAttach:
119
156
  "UserKnownHostsFile": "/dev/null",
120
157
  "ProxyJump": f"{run_name}-host",
121
158
  }
122
- elif ssh_proxy is not None:
123
- self.container_config = {
124
- "HostName": hostname,
125
- "Port": ssh_port,
126
- "User": container_user,
127
- "IdentityFile": self.identity_file,
128
- "IdentitiesOnly": "yes",
129
- "StrictHostKeyChecking": "no",
130
- "UserKnownHostsFile": "/dev/null",
131
- "ProxyJump": f"{run_name}-jump-host",
132
- }
133
159
  else:
134
- self.container_config = None
135
- if local_backend:
136
- self.container_config = None
137
- self.host_config = {
138
- "HostName": hostname,
139
- "Port": container_ssh_port,
140
- "User": container_user,
141
- "IdentityFile": self.identity_file,
142
- "IdentitiesOnly": "yes",
143
- "StrictHostKeyChecking": "no",
144
- "UserKnownHostsFile": "/dev/null",
145
- }
146
- if self.container_config is not None and get_ssh_client_info().supports_multiplexing:
147
- self.container_config.update(
160
+ if ssh_proxy is not None:
161
+ # Kubernetes
162
+ hosts[f"{run_name}-jump-host"] = {
163
+ "HostName": ssh_proxy.hostname,
164
+ "Port": ssh_proxy.port,
165
+ "User": ssh_proxy.username,
166
+ "IdentityFile": self.identity_file,
167
+ "IdentitiesOnly": "yes",
168
+ "StrictHostKeyChecking": "no",
169
+ "UserKnownHostsFile": "/dev/null",
170
+ }
171
+ hosts[run_name] = {
172
+ "HostName": hostname,
173
+ "Port": ssh_port,
174
+ "User": container_user,
175
+ "IdentityFile": self.identity_file,
176
+ "IdentitiesOnly": "yes",
177
+ "StrictHostKeyChecking": "no",
178
+ "UserKnownHostsFile": "/dev/null",
179
+ "ProxyJump": f"{run_name}-jump-host",
180
+ }
181
+ else:
182
+ # Container-based backends
183
+ hosts[run_name] = {
184
+ "HostName": hostname,
185
+ "Port": ssh_port,
186
+ "User": container_user,
187
+ "IdentityFile": self.identity_file,
188
+ "IdentitiesOnly": "yes",
189
+ "StrictHostKeyChecking": "no",
190
+ "UserKnownHostsFile": "/dev/null",
191
+ }
192
+ if get_ssh_client_info().supports_multiplexing:
193
+ hosts[run_name].update(
148
194
  {
149
195
  "ControlMaster": "auto",
150
196
  "ControlPath": self.control_sock_path,
@@ -153,14 +199,8 @@ class SSHAttach:
153
199
 
154
200
  def attach(self):
155
201
  include_ssh_config(self.ssh_config_path)
156
- if self.container_config is None:
157
- update_ssh_config(self.ssh_config_path, self.run_name, self.host_config)
158
- elif self.ssh_proxy is not None:
159
- update_ssh_config(self.ssh_config_path, f"{self.run_name}-jump-host", self.host_config)
160
- update_ssh_config(self.ssh_config_path, self.run_name, self.container_config)
161
- else:
162
- update_ssh_config(self.ssh_config_path, f"{self.run_name}-host", self.host_config)
163
- update_ssh_config(self.ssh_config_path, self.run_name, self.container_config)
202
+ for host, options in self.hosts.items():
203
+ update_ssh_config(self.ssh_config_path, host, options)
164
204
 
165
205
  max_retries = 10
166
206
  self._ports_lock.release()
@@ -178,9 +218,8 @@ class SSHAttach:
178
218
 
179
219
  def detach(self):
180
220
  self.tunnel.close()
181
- update_ssh_config(self.ssh_config_path, f"{self.run_name}-jump-host", {})
182
- update_ssh_config(self.ssh_config_path, f"{self.run_name}-host", {})
183
- update_ssh_config(self.ssh_config_path, self.run_name, {})
221
+ for host in self.hosts:
222
+ update_ssh_config(self.ssh_config_path, host, {})
184
223
 
185
224
  def __enter__(self):
186
225
  self.attach()