dstack 0.18.40rc1__py3-none-any.whl → 0.18.42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. dstack/_internal/cli/commands/apply.py +8 -5
  2. dstack/_internal/cli/services/configurators/base.py +4 -2
  3. dstack/_internal/cli/services/configurators/fleet.py +21 -9
  4. dstack/_internal/cli/services/configurators/gateway.py +15 -0
  5. dstack/_internal/cli/services/configurators/run.py +6 -5
  6. dstack/_internal/cli/services/configurators/volume.py +15 -0
  7. dstack/_internal/cli/services/repos.py +3 -3
  8. dstack/_internal/cli/utils/fleet.py +44 -33
  9. dstack/_internal/cli/utils/run.py +27 -7
  10. dstack/_internal/cli/utils/volume.py +30 -9
  11. dstack/_internal/core/backends/aws/compute.py +94 -53
  12. dstack/_internal/core/backends/aws/resources.py +22 -12
  13. dstack/_internal/core/backends/azure/compute.py +2 -0
  14. dstack/_internal/core/backends/base/compute.py +20 -2
  15. dstack/_internal/core/backends/gcp/compute.py +32 -24
  16. dstack/_internal/core/backends/gcp/resources.py +0 -15
  17. dstack/_internal/core/backends/oci/compute.py +10 -5
  18. dstack/_internal/core/backends/oci/resources.py +23 -26
  19. dstack/_internal/core/backends/remote/provisioning.py +65 -27
  20. dstack/_internal/core/backends/runpod/compute.py +1 -0
  21. dstack/_internal/core/models/backends/azure.py +3 -1
  22. dstack/_internal/core/models/configurations.py +24 -1
  23. dstack/_internal/core/models/fleets.py +46 -0
  24. dstack/_internal/core/models/instances.py +5 -1
  25. dstack/_internal/core/models/pools.py +4 -1
  26. dstack/_internal/core/models/profiles.py +10 -4
  27. dstack/_internal/core/models/runs.py +23 -3
  28. dstack/_internal/core/models/volumes.py +26 -0
  29. dstack/_internal/core/services/ssh/attach.py +92 -53
  30. dstack/_internal/core/services/ssh/tunnel.py +58 -31
  31. dstack/_internal/proxy/gateway/routers/registry.py +2 -0
  32. dstack/_internal/proxy/gateway/schemas/registry.py +2 -0
  33. dstack/_internal/proxy/gateway/services/registry.py +4 -0
  34. dstack/_internal/proxy/lib/models.py +3 -0
  35. dstack/_internal/proxy/lib/services/service_connection.py +8 -1
  36. dstack/_internal/server/background/tasks/process_instances.py +73 -35
  37. dstack/_internal/server/background/tasks/process_metrics.py +9 -9
  38. dstack/_internal/server/background/tasks/process_running_jobs.py +77 -26
  39. dstack/_internal/server/background/tasks/process_runs.py +2 -12
  40. dstack/_internal/server/background/tasks/process_submitted_jobs.py +121 -49
  41. dstack/_internal/server/background/tasks/process_terminating_jobs.py +14 -3
  42. dstack/_internal/server/background/tasks/process_volumes.py +11 -1
  43. dstack/_internal/server/migrations/versions/1338b788b612_reverse_job_instance_relationship.py +71 -0
  44. dstack/_internal/server/migrations/versions/1e76fb0dde87_add_jobmodel_inactivity_secs.py +32 -0
  45. dstack/_internal/server/migrations/versions/51d45659d574_add_instancemodel_blocks_fields.py +43 -0
  46. dstack/_internal/server/migrations/versions/63c3f19cb184_add_jobterminationreason_inactivity_.py +83 -0
  47. dstack/_internal/server/migrations/versions/a751ef183f27_move_attachment_data_to_volumes_.py +34 -0
  48. dstack/_internal/server/models.py +27 -23
  49. dstack/_internal/server/routers/runs.py +1 -0
  50. dstack/_internal/server/schemas/runner.py +1 -0
  51. dstack/_internal/server/services/backends/configurators/azure.py +34 -8
  52. dstack/_internal/server/services/config.py +9 -0
  53. dstack/_internal/server/services/fleets.py +32 -3
  54. dstack/_internal/server/services/gateways/client.py +9 -1
  55. dstack/_internal/server/services/jobs/__init__.py +217 -45
  56. dstack/_internal/server/services/jobs/configurators/base.py +47 -2
  57. dstack/_internal/server/services/offers.py +96 -10
  58. dstack/_internal/server/services/pools.py +98 -14
  59. dstack/_internal/server/services/proxy/repo.py +17 -3
  60. dstack/_internal/server/services/runner/client.py +9 -6
  61. dstack/_internal/server/services/runner/ssh.py +33 -5
  62. dstack/_internal/server/services/runs.py +48 -179
  63. dstack/_internal/server/services/services/__init__.py +9 -1
  64. dstack/_internal/server/services/volumes.py +68 -9
  65. dstack/_internal/server/statics/index.html +1 -1
  66. dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js → main-2ac66bfcbd2e39830b88.js} +30 -31
  67. dstack/_internal/server/statics/{main-11ec5e4a00ea6ec833e3.js.map → main-2ac66bfcbd2e39830b88.js.map} +1 -1
  68. dstack/_internal/server/statics/{main-fc56d1f4af8e57522a1c.css → main-ad5150a441de98cd8987.css} +1 -1
  69. dstack/_internal/server/testing/common.py +130 -61
  70. dstack/_internal/utils/common.py +22 -8
  71. dstack/_internal/utils/env.py +14 -0
  72. dstack/_internal/utils/ssh.py +1 -1
  73. dstack/api/server/_fleets.py +25 -1
  74. dstack/api/server/_runs.py +23 -2
  75. dstack/api/server/_volumes.py +12 -1
  76. dstack/version.py +1 -1
  77. {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/METADATA +1 -1
  78. {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/RECORD +104 -93
  79. tests/_internal/cli/services/configurators/test_profile.py +3 -3
  80. tests/_internal/core/services/ssh/test_tunnel.py +56 -4
  81. tests/_internal/proxy/gateway/routers/test_registry.py +30 -7
  82. tests/_internal/server/background/tasks/test_process_instances.py +138 -20
  83. tests/_internal/server/background/tasks/test_process_metrics.py +12 -0
  84. tests/_internal/server/background/tasks/test_process_running_jobs.py +193 -0
  85. tests/_internal/server/background/tasks/test_process_runs.py +27 -3
  86. tests/_internal/server/background/tasks/test_process_submitted_jobs.py +53 -6
  87. tests/_internal/server/background/tasks/test_process_terminating_jobs.py +135 -17
  88. tests/_internal/server/routers/test_fleets.py +15 -2
  89. tests/_internal/server/routers/test_pools.py +6 -0
  90. tests/_internal/server/routers/test_runs.py +27 -0
  91. tests/_internal/server/routers/test_volumes.py +9 -2
  92. tests/_internal/server/services/jobs/__init__.py +0 -0
  93. tests/_internal/server/services/jobs/configurators/__init__.py +0 -0
  94. tests/_internal/server/services/jobs/configurators/test_base.py +72 -0
  95. tests/_internal/server/services/runner/test_client.py +22 -3
  96. tests/_internal/server/services/test_offers.py +167 -0
  97. tests/_internal/server/services/test_pools.py +109 -1
  98. tests/_internal/server/services/test_runs.py +5 -41
  99. tests/_internal/utils/test_common.py +21 -0
  100. tests/_internal/utils/test_env.py +38 -0
  101. {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/LICENSE.md +0 -0
  102. {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/WHEEL +0 -0
  103. {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/entry_points.txt +0 -0
  104. {dstack-0.18.40rc1.dist-info → dstack-0.18.42.dist-info}/top_level.txt +0 -0
@@ -203,34 +203,29 @@ def check_availability_in_domain(
203
203
  return available
204
204
 
205
205
 
206
- def check_availability_in_region(
206
+ def check_availability_per_domain(
207
207
  shape_names: Iterable[str],
208
208
  shapes_quota: ShapesQuota,
209
209
  region: OCIRegionClient,
210
210
  compartment_id: str,
211
- ) -> Set[str]:
212
- """
213
- Returns a subset of `shape_names` with only the shapes available in at least
214
- one availability domain within `region`.
215
- """
216
-
211
+ ) -> Dict[str, Set[str]]:
217
212
  all_shapes = set(shape_names)
218
- available_shapes = set()
213
+ available_shapes_per_domain = {}
219
214
 
220
215
  for availability_domain in region.availability_domains:
221
216
  shapes_to_check = {
222
217
  shape
223
- for shape in all_shapes - available_shapes
218
+ for shape in all_shapes
224
219
  if shapes_quota.is_within_domain_quota(shape, availability_domain.name)
225
220
  }
226
- available_shapes |= check_availability_in_domain(
221
+ available_shapes_per_domain[availability_domain.name] = check_availability_in_domain(
227
222
  shape_names=shapes_to_check,
228
223
  availability_domain_name=availability_domain.name,
229
224
  client=region.compute_client,
230
225
  compartment_id=compartment_id,
231
226
  )
232
227
 
233
- return available_shapes
228
+ return available_shapes_per_domain
234
229
 
235
230
 
236
231
  def get_shapes_availability(
@@ -239,12 +234,11 @@ def get_shapes_availability(
239
234
  regions: Mapping[str, OCIRegionClient],
240
235
  compartment_id: str,
241
236
  executor: Executor,
242
- ) -> Dict[str, Set[str]]:
237
+ ) -> Dict[str, Dict[str, List[str]]]:
243
238
  """
244
- Returns a mapping of region names to sets of shape names available in these
245
- regions. Only shapes from `offers` are checked.
239
+ Returns availability domains where shapes are available as regions->shapes->availability_domains mapping.
240
+ Only shapes from `offers` are checked.
246
241
  """
247
-
248
242
  shape_names_per_region = {region: set() for region in regions}
249
243
  for offer in offers:
250
244
  if shapes_quota.is_within_region_quota(offer.instance.name, offer.region):
@@ -253,7 +247,7 @@ def get_shapes_availability(
253
247
  future_to_region_name = {}
254
248
  for region_name, shape_names in shape_names_per_region.items():
255
249
  future = executor.submit(
256
- check_availability_in_region,
250
+ check_availability_per_domain,
257
251
  shape_names,
258
252
  shapes_quota,
259
253
  regions[region_name],
@@ -263,29 +257,32 @@ def get_shapes_availability(
263
257
 
264
258
  result = {}
265
259
  for future in as_completed(future_to_region_name):
266
- region_name = future_to_region_name[future]
267
- result[region_name] = future.result()
260
+ domains_to_shape_names = future.result()
261
+ shape_names_to_domains = {}
262
+ for domain, shape_names in domains_to_shape_names.items():
263
+ for shape_name in shape_names:
264
+ shape_names_to_domains.setdefault(shape_name, []).append(domain)
265
+ result[future_to_region_name[future]] = shape_names_to_domains
268
266
 
269
267
  return result
270
268
 
271
269
 
272
- def choose_available_domain(
270
+ def get_available_domains(
273
271
  shape_name: str, shapes_quota: ShapesQuota, region: OCIRegionClient, compartment_id: str
274
- ) -> Optional[str]:
272
+ ) -> List[str]:
275
273
  """
276
- Returns the name of any availability domain within `region` in which
277
- `shape_name` is available. None if the shape is unavailable or not within
278
- `shapes_quota` in all domains.
274
+ Returns the names of all availability domains in `region` in which
275
+ `shape_name` is available and within `shapes_quota`.
279
276
  """
280
-
277
+ domains = []
281
278
  for domain in region.availability_domains:
282
279
  if shapes_quota.is_within_domain_quota(
283
280
  shape_name, domain.name
284
281
  ) and check_availability_in_domain(
285
282
  {shape_name}, domain.name, region.compute_client, compartment_id
286
283
  ):
287
- return domain.name
288
- return None
284
+ domains.append(domain.name)
285
+ return domains
289
286
 
290
287
 
291
288
  def get_instance_vnic(
@@ -1,9 +1,9 @@
1
1
  import io
2
2
  import json
3
3
  import time
4
- from contextlib import contextmanager
4
+ from contextlib import contextmanager, nullcontext
5
5
  from textwrap import dedent
6
- from typing import Any, Dict, Generator, List
6
+ from typing import Any, Dict, Generator, List, Optional
7
7
 
8
8
  import paramiko
9
9
  from gpuhunt import AcceleratorVendor, correct_gpu_memory_gib
@@ -17,6 +17,7 @@ from dstack._internal.core.models.instances import (
17
17
  Gpu,
18
18
  InstanceType,
19
19
  Resources,
20
+ SSHConnectionParams,
20
21
  )
21
22
  from dstack._internal.utils.gpu import (
22
23
  convert_amd_gpu_name,
@@ -262,35 +263,72 @@ def host_info_to_instance_type(host_info: Dict[str, Any]) -> InstanceType:
262
263
 
263
264
  @contextmanager
264
265
  def get_paramiko_connection(
265
- ssh_user: str, host: str, port: int, pkeys: List[paramiko.PKey]
266
+ ssh_user: str,
267
+ host: str,
268
+ port: int,
269
+ pkeys: List[paramiko.PKey],
270
+ proxy: Optional[SSHConnectionParams] = None,
271
+ proxy_pkeys: Optional[list[paramiko.PKey]] = None,
266
272
  ) -> Generator[paramiko.SSHClient, None, None]:
267
- with paramiko.SSHClient() as client:
268
- client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
269
- for pkey in pkeys:
270
- conn_url = f"{ssh_user}@{host}:{port}"
273
+ if proxy is not None:
274
+ if proxy_pkeys is None:
275
+ raise ProvisioningError("Missing proxy private keys")
276
+ proxy_ctx = get_paramiko_connection(
277
+ proxy.username, proxy.hostname, proxy.port, proxy_pkeys
278
+ )
279
+ else:
280
+ proxy_ctx = nullcontext()
281
+ conn_url = f"{ssh_user}@{host}:{port}"
282
+ with proxy_ctx as proxy_client, paramiko.SSHClient() as client:
283
+ proxy_channel: Optional[paramiko.Channel] = None
284
+ if proxy_client is not None:
271
285
  try:
272
- logger.debug("Try to connect to %s with key %s", conn_url, pkey.fingerprint)
273
- client.connect(
274
- username=ssh_user,
275
- hostname=host,
276
- port=port,
277
- pkey=pkey,
278
- look_for_keys=False,
279
- allow_agent=False,
280
- timeout=SSH_CONNECT_TIMEOUT,
286
+ proxy_channel = proxy_client.get_transport().open_channel(
287
+ "direct-tcpip", (host, port), ("", 0)
281
288
  )
282
- except paramiko.AuthenticationException:
283
- logger.debug(
284
- f'Authentication failed to connect to "{conn_url}" and {pkey.fingerprint}'
285
- )
286
- continue # try next key
287
289
  except (paramiko.SSHException, OSError) as e:
288
- raise ProvisioningError(f"Connect failed: {e}") from e
289
- else:
290
+ raise ProvisioningError(f"Proxy channel failed: {e}") from e
291
+ client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
292
+ for pkey in pkeys:
293
+ logger.debug("Try to connect to %s with key %s", conn_url, pkey.fingerprint)
294
+ connected = _paramiko_connect(client, ssh_user, host, port, pkey, proxy_channel)
295
+ if connected:
290
296
  yield client
291
297
  return
292
- else:
293
- keys_fp = ", ".join(f"{pk.fingerprint!r}" for pk in pkeys)
294
- raise ProvisioningError(
295
- f"SSH connection to the {conn_url} with keys [{keys_fp}] was unsuccessful"
298
+ logger.debug(
299
+ f'Authentication failed to connect to "{conn_url}" and {pkey.fingerprint}'
296
300
  )
301
+ keys_fp = ", ".join(f"{pk.fingerprint!r}" for pk in pkeys)
302
+ raise ProvisioningError(
303
+ f"SSH connection to the {conn_url} with keys [{keys_fp}] was unsuccessful"
304
+ )
305
+
306
+
307
+ def _paramiko_connect(
308
+ client: paramiko.SSHClient,
309
+ user: str,
310
+ host: str,
311
+ port: int,
312
+ pkey: paramiko.PKey,
313
+ channel: Optional[paramiko.Channel] = None,
314
+ ) -> bool:
315
+ """
316
+ Returns `True` if connected, `False` if auth failed, and raises `ProvisioningError`
317
+ on other errors.
318
+ """
319
+ try:
320
+ client.connect(
321
+ username=user,
322
+ hostname=host,
323
+ port=port,
324
+ pkey=pkey,
325
+ look_for_keys=False,
326
+ allow_agent=False,
327
+ timeout=SSH_CONNECT_TIMEOUT,
328
+ sock=channel,
329
+ )
330
+ return True
331
+ except paramiko.AuthenticationException:
332
+ return False
333
+ except (paramiko.SSHException, OSError) as e:
334
+ raise ProvisioningError(f"Connect failed: {e}") from e
@@ -113,6 +113,7 @@ class RunpodCompute(Compute):
113
113
  bid_per_gpu=bid_per_gpu,
114
114
  network_volume_id=network_volume_id,
115
115
  volume_mount_path=volume_mount_path,
116
+ env={"RUNPOD_POD_USER": "0"},
116
117
  )
117
118
 
118
119
  instance_id = resp["id"]
@@ -11,6 +11,7 @@ class AzureConfigInfo(CoreModel):
11
11
  type: Literal["azure"] = "azure"
12
12
  tenant_id: str
13
13
  subscription_id: str
14
+ resource_group: Optional[str] = None
14
15
  locations: Optional[List[str]] = None
15
16
  vpc_ids: Optional[Dict[str, str]] = None
16
17
  public_ips: Optional[bool] = None
@@ -48,6 +49,7 @@ class AzureConfigInfoWithCredsPartial(CoreModel):
48
49
  creds: Optional[AnyAzureCreds]
49
50
  tenant_id: Optional[str]
50
51
  subscription_id: Optional[str]
52
+ resource_group: Optional[str]
51
53
  locations: Optional[List[str]]
52
54
  vpc_ids: Optional[Dict[str, str]]
53
55
  public_ips: Optional[bool]
@@ -63,4 +65,4 @@ class AzureConfigValues(CoreModel):
63
65
 
64
66
 
65
67
  class AzureStoredConfig(AzureConfigInfo):
66
- resource_group: str
68
+ resource_group: str = ""
@@ -10,7 +10,7 @@ from dstack._internal.core.models.common import CoreModel, Duration, RegistryAut
10
10
  from dstack._internal.core.models.envs import Env
11
11
  from dstack._internal.core.models.fleets import FleetConfiguration
12
12
  from dstack._internal.core.models.gateways import GatewayConfiguration
13
- from dstack._internal.core.models.profiles import ProfileParams
13
+ from dstack._internal.core.models.profiles import ProfileParams, parse_off_duration
14
14
  from dstack._internal.core.models.repos.base import Repo
15
15
  from dstack._internal.core.models.repos.virtual import VirtualRepo
16
16
  from dstack._internal.core.models.resources import Range, ResourcesSpec
@@ -212,6 +212,29 @@ class DevEnvironmentConfigurationParams(CoreModel):
212
212
  ide: Annotated[Literal["vscode"], Field(description="The IDE to run")]
213
213
  version: Annotated[Optional[str], Field(description="The version of the IDE")]
214
214
  init: Annotated[CommandsList, Field(description="The bash commands to run on startup")] = []
215
+ inactivity_duration: Annotated[
216
+ Optional[Union[Literal["off"], int, bool, str]],
217
+ Field(
218
+ description=(
219
+ "The maximum amount of time the dev environment can be inactive"
220
+ " (e.g., `2h`, `1d`, etc)."
221
+ " After it elapses, the dev environment is automatically stopped."
222
+ " Inactivity is defined as the absence of SSH connections to the"
223
+ " dev environment, including VS Code connections, `ssh <run name>`"
224
+ " shells, and attached `dstack apply` or `dstack attach` commands."
225
+ " Use `off` for unlimited duration. Defaults to `off`"
226
+ )
227
+ ),
228
+ ]
229
+
230
+ @validator("inactivity_duration", pre=True, allow_reuse=True)
231
+ def parse_inactivity_duration(
232
+ cls, v: Optional[Union[Literal["off"], int, bool, str]]
233
+ ) -> Optional[int]:
234
+ v = parse_off_duration(v)
235
+ if isinstance(v, int):
236
+ return v
237
+ return None
215
238
 
216
239
 
217
240
  class DevEnvironmentConfiguration(
@@ -39,6 +39,14 @@ class InstanceGroupPlacement(str, Enum):
39
39
  CLUSTER = "cluster"
40
40
 
41
41
 
42
+ class SSHProxyParams(CoreModel):
43
+ hostname: Annotated[str, Field(description="The IP address or domain of proxy host")]
44
+ port: Annotated[Optional[int], Field(description="The SSH port of proxy host")] = None
45
+ user: Annotated[str, Field(description="The user to log in with for proxy host")]
46
+ identity_file: Annotated[str, Field(description="The private key to use for proxy host")]
47
+ ssh_key: Optional[SSHKey] = None
48
+
49
+
42
50
  class SSHHostParams(CoreModel):
43
51
  hostname: Annotated[str, Field(description="The IP address or domain to connect to")]
44
52
  port: Annotated[
@@ -50,6 +58,9 @@ class SSHHostParams(CoreModel):
50
58
  identity_file: Annotated[
51
59
  Optional[str], Field(description="The private key to use for this host")
52
60
  ] = None
61
+ proxy_jump: Annotated[
62
+ Optional[SSHProxyParams], Field(description="The SSH proxy configuration for this host")
63
+ ] = None
53
64
  internal_ip: Annotated[
54
65
  Optional[str],
55
66
  Field(
@@ -61,6 +72,19 @@ class SSHHostParams(CoreModel):
61
72
  ] = None
62
73
  ssh_key: Optional[SSHKey] = None
63
74
 
75
+ blocks: Annotated[
76
+ Union[Literal["auto"], int],
77
+ Field(
78
+ description=(
79
+ "The amount of blocks to split the instance into, a number or `auto`."
80
+ " `auto` means as many as possible."
81
+ " The number of GPUs and CPUs must be divisible by the number of blocks."
82
+ " Defaults to `1`, i.e. do not split"
83
+ ),
84
+ ge=1,
85
+ ),
86
+ ] = 1
87
+
64
88
  @validator("internal_ip")
65
89
  def validate_internal_ip(cls, value):
66
90
  if value is None:
@@ -83,6 +107,9 @@ class SSHParams(CoreModel):
83
107
  Optional[str], Field(description="The private key to use for all hosts")
84
108
  ] = None
85
109
  ssh_key: Optional[SSHKey] = None
110
+ proxy_jump: Annotated[
111
+ Optional[SSHProxyParams], Field(description="The SSH proxy configuration for all hosts")
112
+ ] = None
86
113
  hosts: Annotated[
87
114
  List[Union[SSHHostParams, str]],
88
115
  Field(
@@ -142,6 +169,19 @@ class InstanceGroupParams(CoreModel):
142
169
  Field(description="The resources requirements"),
143
170
  ] = ResourcesSpec()
144
171
 
172
+ blocks: Annotated[
173
+ Union[Literal["auto"], int],
174
+ Field(
175
+ description=(
176
+ "The amount of blocks to split the instance into, a number or `auto`."
177
+ " `auto` means as many as possible."
178
+ " The number of GPUs and CPUs must be divisible by the number of blocks."
179
+ " Defaults to `1`, i.e. do not split"
180
+ ),
181
+ ge=1,
182
+ ),
183
+ ] = 1
184
+
145
185
  backends: Annotated[
146
186
  Optional[List[BackendType]],
147
187
  Field(description="The backends to consider for provisioning (e.g., `[aws, gcp]`)"),
@@ -152,6 +192,12 @@ class InstanceGroupParams(CoreModel):
152
192
  description="The regions to consider for provisioning (e.g., `[eu-west-1, us-west4, westeurope]`)"
153
193
  ),
154
194
  ] = None
195
+ availability_zones: Annotated[
196
+ Optional[List[str]],
197
+ Field(
198
+ description="The availability zones to consider for provisioning (e.g., `[eu-west-1a, us-west4-a]`)"
199
+ ),
200
+ ] = None
155
201
  instance_types: Annotated[
156
202
  Optional[List[str]],
157
203
  Field(
@@ -92,6 +92,8 @@ class RemoteConnectionInfo(CoreModel):
92
92
  port: int
93
93
  ssh_user: str
94
94
  ssh_keys: List[SSHKey]
95
+ ssh_proxy: Optional[SSHConnectionParams] = None
96
+ ssh_proxy_keys: Optional[list[SSHKey]] = None
95
97
  env: Env = Env()
96
98
 
97
99
 
@@ -101,7 +103,6 @@ class InstanceConfiguration(CoreModel):
101
103
  user: str # dstack user name
102
104
  ssh_keys: List[SSHKey]
103
105
  instance_id: Optional[str] = None
104
- availability_zone: Optional[str] = None
105
106
  placement_group_name: Optional[str] = None
106
107
  reservation: Optional[str] = None
107
108
  volumes: Optional[List[Volume]] = None
@@ -140,7 +141,10 @@ class InstanceOffer(CoreModel):
140
141
 
141
142
  class InstanceOfferWithAvailability(InstanceOffer):
142
143
  availability: InstanceAvailability
144
+ availability_zones: Optional[List[str]] = None
143
145
  instance_runtime: InstanceRuntime = InstanceRuntime.SHIM
146
+ blocks: int = 1
147
+ total_blocks: int = 1
144
148
 
145
149
 
146
150
  class InstanceStatus(str, Enum):
@@ -25,14 +25,17 @@ class Instance(CoreModel):
25
25
  fleet_name: Optional[str] = None
26
26
  instance_num: int
27
27
  pool_name: Optional[str] = None
28
- job_name: Optional[str] = None
28
+ job_name: Optional[str] = None # deprecated, always None (instance can have more than one job)
29
29
  hostname: Optional[str] = None
30
30
  status: InstanceStatus
31
31
  unreachable: bool = False
32
32
  termination_reason: Optional[str] = None
33
33
  created: datetime.datetime
34
34
  region: Optional[str] = None
35
+ availability_zone: Optional[str] = None
35
36
  price: Optional[float] = None
37
+ total_blocks: Optional[int] = None
38
+ busy_blocks: int = 0
36
39
 
37
40
 
38
41
  class PoolInstances(CoreModel):
@@ -40,15 +40,15 @@ def parse_duration(v: Optional[Union[int, str]]) -> Optional[int]:
40
40
  return Duration.parse(v)
41
41
 
42
42
 
43
- def parse_max_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int, bool]]:
43
+ def parse_max_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
44
44
  return parse_off_duration(v)
45
45
 
46
46
 
47
- def parse_stop_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int, bool]]:
47
+ def parse_stop_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
48
48
  return parse_off_duration(v)
49
49
 
50
50
 
51
- def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int, bool]]:
51
+ def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[str, int]]:
52
52
  if v == "off" or v is False:
53
53
  return "off"
54
54
  if v is True:
@@ -123,6 +123,12 @@ class ProfileParams(CoreModel):
123
123
  description="The regions to consider for provisioning (e.g., `[eu-west-1, us-west4, westeurope]`)"
124
124
  ),
125
125
  ]
126
+ availability_zones: Annotated[
127
+ Optional[List[str]],
128
+ Field(
129
+ description="The availability zones to consider for provisioning (e.g., `[eu-west-1a, us-west4-a]`)"
130
+ ),
131
+ ] = None
126
132
  instance_types: Annotated[
127
133
  Optional[List[str]],
128
134
  Field(
@@ -162,7 +168,7 @@ class ProfileParams(CoreModel):
162
168
  Optional[Union[Literal["off"], str, int, bool]],
163
169
  Field(
164
170
  description=(
165
- "The maximum duration of a run gracefull stopping."
171
+ "The maximum duration of a run graceful stopping."
166
172
  " After it elapses, the run is automatically forced stopped."
167
173
  " This includes force detaching volumes used by the run."
168
174
  " Use `off` for unlimited duration. Defaults to `5m`"
@@ -27,6 +27,7 @@ from dstack._internal.core.models.profiles import (
27
27
  from dstack._internal.core.models.repos import AnyRunRepoData
28
28
  from dstack._internal.core.models.resources import Memory, ResourcesSpec
29
29
  from dstack._internal.core.models.unix import UnixUser
30
+ from dstack._internal.core.models.volumes import MountPoint
30
31
  from dstack._internal.utils import common as common_utils
31
32
  from dstack._internal.utils.common import format_pretty_duration
32
33
 
@@ -112,6 +113,7 @@ class JobTerminationReason(str, Enum):
112
113
  DONE_BY_RUNNER = "done_by_runner"
113
114
  ABORTED_BY_USER = "aborted_by_user"
114
115
  TERMINATED_BY_SERVER = "terminated_by_server"
116
+ INACTIVITY_DURATION_EXCEEDED = "inactivity_duration_exceeded"
115
117
  # Set by the runner
116
118
  CONTAINER_EXITED_WITH_ERROR = "container_exited_with_error"
117
119
  PORTS_BINDING_FAILED = "ports_binding_failed"
@@ -132,6 +134,7 @@ class JobTerminationReason(str, Enum):
132
134
  self.DONE_BY_RUNNER: JobStatus.DONE,
133
135
  self.ABORTED_BY_USER: JobStatus.ABORTED,
134
136
  self.TERMINATED_BY_SERVER: JobStatus.TERMINATED,
137
+ self.INACTIVITY_DURATION_EXCEEDED: JobStatus.TERMINATED,
135
138
  self.CONTAINER_EXITED_WITH_ERROR: JobStatus.FAILED,
136
139
  self.PORTS_BINDING_FAILED: JobStatus.FAILED,
137
140
  self.CREATING_CONTAINER_ERROR: JobStatus.FAILED,
@@ -147,9 +150,9 @@ class JobTerminationReason(str, Enum):
147
150
  class Requirements(CoreModel):
148
151
  # TODO: Make requirements' fields required
149
152
  resources: ResourcesSpec
150
- max_price: Optional[float]
151
- spot: Optional[bool]
152
- reservation: Optional[str]
153
+ max_price: Optional[float] = None
154
+ spot: Optional[bool] = None
155
+ reservation: Optional[str] = None
153
156
 
154
157
  def pretty_format(self, resources_only: bool = False):
155
158
  res = self.resources.pretty_format()
@@ -190,6 +193,7 @@ class JobSpec(CoreModel):
190
193
  registry_auth: Optional[RegistryAuth]
191
194
  requirements: Requirements
192
195
  retry: Optional[Retry]
196
+ volumes: Optional[List[MountPoint]] = None
193
197
  # For backward compatibility with 0.18.x when retry_policy was required.
194
198
  # TODO: remove in 0.19
195
199
  retry_policy: ProfileRetryPolicy = ProfileRetryPolicy(retry=False)
@@ -231,6 +235,17 @@ class JobProvisioningData(CoreModel):
231
235
 
232
236
 
233
237
  class JobRuntimeData(CoreModel):
238
+ """
239
+ Holds various information only available after the job is submitted, such as:
240
+ * offer (depends on the instance)
241
+ * volumes used by the job
242
+ * resource constraints for container (depend on the instance)
243
+ * port mapping (reported by the shim only after the container is started)
244
+
245
+ Some fields are mutable, for example, `ports` only available when the shim starts
246
+ the container.
247
+ """
248
+
234
249
  network_mode: NetworkMode
235
250
  # GPU, CPU, memory resource shares. None means all available (no limit)
236
251
  gpu: Optional[int] = None
@@ -240,6 +255,10 @@ class JobRuntimeData(CoreModel):
240
255
  # None if data is not yet available (on vm-based backends and ssh instances)
241
256
  # or not applicable (container-based backends)
242
257
  ports: Optional[dict[int, int]] = None
258
+ # List of volumes used by the job
259
+ volume_names: Optional[list[str]] = None # None for backward compalibility
260
+ # Virtual shared offer
261
+ offer: Optional[InstanceOfferWithAvailability] = None # None for backward compalibility
243
262
 
244
263
 
245
264
  class ClusterInfo(CoreModel):
@@ -254,6 +273,7 @@ class JobSubmission(CoreModel):
254
273
  submitted_at: datetime
255
274
  last_processed_at: datetime
256
275
  finished_at: Optional[datetime]
276
+ inactivity_secs: Optional[int]
257
277
  status: JobStatus
258
278
  termination_reason: Optional[JobTerminationReason]
259
279
  termination_reason_message: Optional[str]
@@ -32,6 +32,9 @@ class VolumeConfiguration(CoreModel):
32
32
  name: Annotated[Optional[str], Field(description="The volume name")] = None
33
33
  backend: Annotated[BackendType, Field(description="The volume backend")]
34
34
  region: Annotated[str, Field(description="The volume region")]
35
+ availability_zone: Annotated[
36
+ Optional[str], Field(description="The volume availability zone")
37
+ ] = None
35
38
  size: Annotated[
36
39
  Optional[Memory],
37
40
  Field(description="The volume size. Must be specified when creating new volumes"),
@@ -68,6 +71,18 @@ class VolumeAttachmentData(CoreModel):
68
71
  device_name: Optional[str] = None
69
72
 
70
73
 
74
+ class VolumeInstance(CoreModel):
75
+ name: str
76
+ fleet_name: Optional[str] = None
77
+ instance_num: int
78
+ instance_id: Optional[str] = None
79
+
80
+
81
+ class VolumeAttachment(CoreModel):
82
+ instance: VolumeInstance
83
+ attachment_data: Optional[VolumeAttachmentData] = None
84
+
85
+
71
86
  class Volume(CoreModel):
72
87
  id: uuid.UUID
73
88
  name: str
@@ -83,8 +98,19 @@ class Volume(CoreModel):
83
98
  deleted: bool
84
99
  volume_id: Optional[str] = None # id of the volume in the cloud
85
100
  provisioning_data: Optional[VolumeProvisioningData] = None
101
+ attachments: Optional[List[VolumeAttachment]] = None
102
+ # attachment_data is deprecated in favor of attachments.
103
+ # It's only set for volumes that were attached before attachments.
86
104
  attachment_data: Optional[VolumeAttachmentData] = None
87
105
 
106
+ def get_attachment_data_for_instance(self, instance_id: str) -> Optional[VolumeAttachmentData]:
107
+ if self.attachments is not None:
108
+ for attachment in self.attachments:
109
+ if attachment.instance.instance_id == instance_id:
110
+ return attachment.attachment_data
111
+ # volume was attached before attachments were introduced
112
+ return self.attachment_data
113
+
88
114
 
89
115
  class VolumePlan(CoreModel):
90
116
  project_name: str