dstack 0.19.31__py3-none-any.whl → 0.19.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack has been flagged for review.

dstack/_internal/core/backends/hotaisle/compute.py CHANGED

@@ -42,6 +42,26 @@ INSTANCE_TYPE_SPECS = {
         "cpu_frequency": 2000000000,
         "cpu_manufacturer": "Intel",
     },
+    "2x MI300X 26x Xeon Platinum 8470": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "2x MI300X 26x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "4x MI300X 52x Xeon Platinum 8462Y": {
+        "cpu_model": "Xeon Platinum 8470",
+        "cpu_frequency": 2000000000,
+        "cpu_manufacturer": "Intel",
+    },
+    "4x MI300X 52x Xeon Platinum 8462Y+": {
+        "cpu_model": "Xeon Platinum 8462Y+",
+        "cpu_frequency": 2800000000,
+        "cpu_manufacturer": "Intel",
+    },
 }



dstack/_internal/core/backends/kubernetes/compute.py CHANGED

@@ -2,6 +2,7 @@ import subprocess
 import tempfile
 import threading
 import time
+from enum import Enum
 from typing import List, Optional, Tuple

 from gpuhunt import KNOWN_NVIDIA_GPUS, AcceleratorVendor
@@ -62,9 +63,28 @@ JUMP_POD_SSH_PORT = 22
 NVIDIA_GPU_NAME_TO_GPU_INFO = {gpu.name: gpu for gpu in KNOWN_NVIDIA_GPUS}
 NVIDIA_GPU_NAMES = NVIDIA_GPU_NAME_TO_GPU_INFO.keys()

+NVIDIA_GPU_RESOURCE = "nvidia.com/gpu"
+NVIDIA_GPU_COUNT_LABEL = f"{NVIDIA_GPU_RESOURCE}.count"
+NVIDIA_GPU_PRODUCT_LABEL = f"{NVIDIA_GPU_RESOURCE}.product"
+NVIDIA_GPU_NODE_TAINT = NVIDIA_GPU_RESOURCE
+
+# Taints we know and tolerate when creating our objects, e.g., the jump pod.
+TOLERATED_NODE_TAINTS = (NVIDIA_GPU_NODE_TAINT,)
+
 DUMMY_REGION = "-"


+class Operator(str, Enum):
+    EXISTS = "Exists"
+    IN = "In"
+
+
+class TaintEffect(str, Enum):
+    NO_EXECUTE = "NoExecute"
+    NO_SCHEDULE = "NoSchedule"
+    PREFER_NO_SCHEDULE = "PreferNoSchedule"
+
+
 class KubernetesCompute(
     ComputeWithFilteredOffersCached,
     ComputeWithPrivilegedSupport,
@@ -181,6 +201,7 @@ class KubernetesCompute(
         resources_requests: dict[str, str] = {}
         resources_limits: dict[str, str] = {}
         node_affinity: Optional[client.V1NodeAffinity] = None
+        tolerations: list[client.V1Toleration] = []
         volumes_: list[client.V1Volume] = []
         volume_mounts: list[client.V1VolumeMount] = []

@@ -226,21 +247,28 @@ class KubernetesCompute(
                 "Requesting %d GPU(s), node labels: %s", gpu_min, matching_gpu_label_values
             )
             # TODO: support other GPU vendors
-            resources_requests["nvidia.com/gpu"] = str(gpu_min)
-            resources_limits["nvidia.com/gpu"] = str(gpu_min)
+            resources_requests[NVIDIA_GPU_RESOURCE] = str(gpu_min)
+            resources_limits[NVIDIA_GPU_RESOURCE] = str(gpu_min)
             node_affinity = client.V1NodeAffinity(
                 required_during_scheduling_ignored_during_execution=[
                     client.V1NodeSelectorTerm(
                         match_expressions=[
                             client.V1NodeSelectorRequirement(
-                                key="nvidia.com/gpu.product",
-                                operator="In",
+                                key=NVIDIA_GPU_PRODUCT_LABEL,
+                                operator=Operator.IN,
                                 values=list(matching_gpu_label_values),
                             ),
                         ],
                     ),
                 ],
             )
+            # It should be NoSchedule, but we also add NoExecute toleration just in case.
+            for effect in [TaintEffect.NO_SCHEDULE, TaintEffect.NO_EXECUTE]:
+                tolerations.append(
+                    client.V1Toleration(
+                        key=NVIDIA_GPU_NODE_TAINT, operator=Operator.EXISTS, effect=effect
+                    )
+                )

         if (memory_min := resources_spec.memory.min) is not None:
             resources_requests["memory"] = _render_memory(memory_min)
@@ -304,6 +332,7 @@ class KubernetesCompute(
                     )
                 ],
                 affinity=node_affinity,
+                tolerations=tolerations,
                 volumes=volumes_,
             ),
         )
@@ -527,8 +556,8 @@ def _get_gpus_from_node_labels(labels: dict[str, str]) -> tuple[list[Gpu], Optio
     # "A100" but a product name like "Tesla-T4" or "A100-SXM4-40GB".
     # Thus, we convert the product name to a known gpu name.
     # TODO: support other GPU vendors
-    gpu_count = labels.get("nvidia.com/gpu.count")
-    gpu_product = labels.get("nvidia.com/gpu.product")
+    gpu_count = labels.get(NVIDIA_GPU_COUNT_LABEL)
+    gpu_product = labels.get(NVIDIA_GPU_PRODUCT_LABEL)
     if gpu_count is None or gpu_product is None:
         return [], None
     gpu_count = int(gpu_count)
@@ -647,6 +676,39 @@ def _create_jump_pod_service(
         namespace=namespace,
         name=pod_name,
     )
+
+    node_list = call_api_method(api.list_node, client.V1NodeList)
+    nodes = get_value(node_list, ".items", list[client.V1Node], required=True)
+    # False if we found at least one node without any "hard" taint, that is, if we don't need to
+    # specify the toleration.
+    toleration_required = True
+    # (key, effect) pairs.
+    tolerated_taints: set[tuple[str, str]] = set()
+    for node in nodes:
+        # True if the node has at least one NoExecute or NoSchedule taint.
+        has_hard_taint = False
+        taints = get_value(node, ".spec.taints", list[client.V1Taint]) or []
+        for taint in taints:
+            effect = get_value(taint, ".effect", str, required=True)
+            # A "soft" taint, ignore.
+            if effect == TaintEffect.PREFER_NO_SCHEDULE:
+                continue
+            has_hard_taint = True
+            key = get_value(taint, ".key", str, required=True)
+            if key in TOLERATED_NODE_TAINTS:
+                tolerated_taints.add((key, effect))
+        if not has_hard_taint:
+            toleration_required = False
+            break
+    tolerations: list[client.V1Toleration] = []
+    if toleration_required:
+        for key, effect in tolerated_taints:
+            tolerations.append(
+                client.V1Toleration(key=key, operator=Operator.EXISTS, effect=effect)
+            )
+        if not tolerations:
+            logger.warning("No appropriate node found, the jump pod may never be scheduled")
+
     commands = _get_jump_pod_commands(authorized_keys=ssh_public_keys)
     pod = client.V1Pod(
         metadata=client.V1ObjectMeta(
@@ -667,7 +729,8 @@ def _create_jump_pod_service(
                         )
                     ],
                 )
-            ]
+            ],
+            tolerations=tolerations,
         ),
     )
     call_api_method(
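The net effect of the Kubernetes changes: GPU run pods and the SSH jump pod now tolerate the `nvidia.com/gpu` node taint that the NVIDIA device plugin / GPU Operator commonly applies, so they can be scheduled onto tainted GPU nodes. A minimal standalone sketch of the resulting pod spec using the `kubernetes` Python client (illustrative only; the container name and image are made up and not taken from the package):

    from kubernetes import client

    # Tolerate the "nvidia.com/gpu" taint for both NoSchedule and NoExecute,
    # mirroring what KubernetesCompute now attaches to GPU pods.
    tolerations = [
        client.V1Toleration(key="nvidia.com/gpu", operator="Exists", effect=effect)
        for effect in ("NoSchedule", "NoExecute")
    ]
    pod_spec = client.V1PodSpec(
        containers=[client.V1Container(name="runner", image="ubuntu:22.04")],
        tolerations=tolerations,
    )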
dstack/_internal/core/backends/nebius/compute.py CHANGED

@@ -19,6 +19,7 @@ from dstack._internal.core.backends.base.compute import (
     ComputeWithPrivilegedSupport,
     generate_unique_instance_name,
     get_user_data,
+    merge_tags,
 )
 from dstack._internal.core.backends.base.offers import get_catalog_offers, get_offers_disk_modifier
 from dstack._internal.core.backends.nebius import resources
@@ -150,6 +151,18 @@ class NebiusCompute(
         if backend_data.cluster is not None:
             cluster_id = backend_data.cluster.id

+        labels = {
+            "owner": "dstack",
+            "dstack_project": instance_config.project_name.lower(),
+            "dstack_name": instance_config.instance_name,
+            "dstack_user": instance_config.user.lower(),
+        }
+        labels = merge_tags(
+            base_tags=labels,
+            backend_tags=self.config.tags,
+            resource_tags=instance_config.tags,
+        )
+        labels = resources.filter_invalid_labels(labels)
         gpus = instance_offer.instance.resources.gpus
         create_disk_op = resources.create_disk(
             sdk=self._sdk,
@@ -159,6 +172,7 @@ class NebiusCompute(
             image_family="ubuntu24.04-cuda12"
             if gpus and gpus[0].name == "B200"
             else "ubuntu22.04-cuda12",
+            labels=labels,
         )
         create_instance_op = None
         try:
@@ -184,6 +198,7 @@ class NebiusCompute(
                 disk_id=create_disk_op.resource_id,
                 subnet_id=self._get_subnet_id(instance_offer.region),
                 preemptible=instance_offer.instance.resources.spot,
+                labels=labels,
             )
             _wait_for_instance(self._sdk, create_instance_op)
         except BaseException:

dstack/_internal/core/backends/nebius/configurator.py CHANGED

@@ -3,6 +3,7 @@ import json
 from nebius.aio.service_error import RequestError

 from dstack._internal.core.backends.base.configurator import (
+    TAGS_MAX_NUM,
     BackendRecord,
     Configurator,
     raise_invalid_credentials_error,
@@ -18,6 +19,7 @@ from dstack._internal.core.backends.nebius.models import (
     NebiusServiceAccountCreds,
     NebiusStoredConfig,
 )
+from dstack._internal.core.errors import BackendError, ServerClientError
 from dstack._internal.core.models.backends.base import BackendType


@@ -53,6 +55,19 @@ class NebiusConfigurator(
                     f" some of the valid options: {sorted(valid_fabrics)}"
                 ),
             )
+        self._check_config_tags(config)
+
+    def _check_config_tags(self, config: NebiusBackendConfigWithCreds):
+        if not config.tags:
+            return
+        if len(config.tags) > TAGS_MAX_NUM:
+            raise ServerClientError(
+                f"Maximum number of tags exceeded. Up to {TAGS_MAX_NUM} tags is allowed."
+            )
+        try:
+            resources.validate_labels(config.tags)
+        except BackendError as e:
+            raise ServerClientError(e.args[0])

     def create_backend(
         self, project_name: str, config: NebiusBackendConfigWithCreds

dstack/_internal/core/backends/nebius/models.py CHANGED

@@ -1,4 +1,6 @@
-from typing import Annotated, Literal, Optional, Union
+import json
+from pathlib import Path
+from typing import Annotated, Dict, Literal, Optional, Union

 from pydantic import Field, root_validator

@@ -27,16 +29,38 @@ class NebiusServiceAccountCreds(CoreModel):
             )
         ),
     ]
+    filename: Annotated[
+        Optional[str], Field(description="The path to the service account credentials file")
+    ] = None


 class NebiusServiceAccountFileCreds(CoreModel):
     type: Annotated[Literal["service_account"], Field(description="The type of credentials")] = (
         "service_account"
     )
-    service_account_id: Annotated[str, Field(description="Service account ID")]
-    public_key_id: Annotated[str, Field(description="ID of the service account public key")]
+    service_account_id: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "Service account ID. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
+    ] = None
+    public_key_id: Annotated[
+        Optional[str],
+        Field(
+            description=(
+                "ID of the service account public key. Set automatically if `filename` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
+    ] = None
     private_key_file: Annotated[
-        Optional[str], Field(description=("Path to the service account private key"))
+        Optional[str],
+        Field(
+            description=(
+                "Path to the service account private key. Set automatically if `filename` or `private_key_content` is specified. When configuring via the UI, it must be specified explicitly"
+            )
+        ),
     ] = None
     private_key_content: Annotated[
         Optional[str],
@@ -44,13 +68,35 @@ class NebiusServiceAccountFileCreds(CoreModel):
             description=(
                 "Content of the service account private key. When configuring via"
                 " `server/config.yml`, it's automatically filled from `private_key_file`."
-                " When configuring via UI, it has to be specified explicitly."
+                " When configuring via UI, it has to be specified explicitly"
             )
         ),
     ] = None
+    filename: Annotated[
+        Optional[str], Field(description="The path to the service account credentials file")
+    ] = None

     @root_validator
     def fill_data(cls, values):
+        if filename := values.get("filename"):
+            try:
+                with open(Path(filename).expanduser()) as f:
+                    data = json.load(f)
+                from nebius.base.service_account.credentials_file import (
+                    ServiceAccountCredentials,
+                )

+                credentials = ServiceAccountCredentials.from_json(data)
+                subject = credentials.subject_credentials
+                values["service_account_id"] = subject.sub
+                values["public_key_id"] = subject.kid
+                values["private_key_content"] = subject.private_key
+            except OSError:
+                raise ValueError(f"No such file {filename}")
+            except Exception as e:
+                raise ValueError(f"Failed to parse credentials file {filename}: {e}")
+            return values
+
         return fill_data(
             values, filename_field="private_key_file", data_field="private_key_content"
         )
@@ -95,6 +141,12 @@ class NebiusBackendConfig(CoreModel):
             )
         ),
     ] = None
+    tags: Annotated[
+        Optional[Dict[str, str]],
+        Field(
+            description="The tags (labels) that will be assigned to resources created by `dstack`"
+        ),
+    ] = None


 class NebiusBackendConfigWithCreds(NebiusBackendConfig):
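With the new `filename` field, Nebius credentials can be supplied as a single JSON credentials file, and the root validator above fills in the individual fields from it. A minimal sketch, assuming the `nebius` extra is installed and a valid credentials file exists at the path shown (the path is an example, not a dstack default):

    from dstack._internal.core.backends.nebius.models import NebiusServiceAccountFileCreds

    # The root validator opens the file, parses it with the Nebius SDK, and sets
    # service_account_id, public_key_id, and private_key_content; a missing or
    # malformed file raises a pydantic ValidationError (wrapping the ValueError).
    creds = NebiusServiceAccountFileCreds(filename="~/.nebius/credentials.json")
    print(creds.service_account_id, creds.public_key_id)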
dstack/_internal/core/backends/nebius/resources.py CHANGED

@@ -1,11 +1,12 @@
 import logging
+import re
 import time
 from collections import defaultdict
 from collections.abc import Container as ContainerT
 from collections.abc import Generator, Iterable, Sequence
 from contextlib import contextmanager
 from tempfile import NamedTemporaryFile
-from typing import Optional
+from typing import Dict, Optional

 from nebius.aio.authorization.options import options_to_metadata
 from nebius.aio.operation import Operation as SDKOperation
@@ -249,13 +250,14 @@ def get_default_subnet(sdk: SDK, project_id: str) -> Subnet:


 def create_disk(
-    sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str
+    sdk: SDK, name: str, project_id: str, size_mib: int, image_family: str, labels: Dict[str, str]
 ) -> SDKOperation[Operation]:
     client = DiskServiceClient(sdk)
     request = CreateDiskRequest(
         metadata=ResourceMetadata(
             name=name,
             parent_id=project_id,
+            labels=labels,
         ),
         spec=DiskSpec(
             size_mebibytes=size_mib,
@@ -288,12 +290,14 @@ def create_instance(
     disk_id: str,
     subnet_id: str,
     preemptible: bool,
+    labels: Dict[str, str],
 ) -> SDKOperation[Operation]:
     client = InstanceServiceClient(sdk)
     request = CreateInstanceRequest(
         metadata=ResourceMetadata(
             name=name,
             parent_id=project_id,
+            labels=labels,
         ),
         spec=InstanceSpec(
             cloud_init_user_data=user_data,
@@ -367,3 +371,42 @@ def delete_cluster(sdk: SDK, cluster_id: str) -> None:
             metadata=REQUEST_MD,
         )
     )
+
+
+def filter_invalid_labels(labels: Dict[str, str]) -> Dict[str, str]:
+    filtered_labels = {}
+    for k, v in labels.items():
+        if not _is_valid_label(k, v):
+            logger.warning("Skipping invalid label '%s: %s'", k, v)
+            continue
+        filtered_labels[k] = v
+    return filtered_labels
+
+
+def validate_labels(labels: Dict[str, str]):
+    for k, v in labels.items():
+        if not _is_valid_label(k, v):
+            raise BackendError("Invalid resource labels")
+
+
+def _is_valid_label(key: str, value: str) -> bool:
+    # TODO: [Nebius] current validation logic reuses GCP's approach.
+    # There is no public information on Nebius labels restrictions.
+    return is_valid_resource_name(key) and is_valid_label_value(value)
+
+
+MAX_RESOURCE_NAME_LEN = 63
+NAME_PATTERN = re.compile(r"^[a-z][_\-a-z0-9]{0,62}$")
+LABEL_VALUE_PATTERN = re.compile(r"^[_\-a-z0-9]{0,63}$")
+
+
+def is_valid_resource_name(name: str) -> bool:
+    if len(name) < 1 or len(name) > MAX_RESOURCE_NAME_LEN:
+        return False
+    match = re.match(NAME_PATTERN, name)
+    return match is not None
+
+
+def is_valid_label_value(value: str) -> bool:
+    match = re.match(LABEL_VALUE_PATTERN, value)
+    return match is not None
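The label helpers above apply GCP-style restrictions: keys must be lowercase and start with a letter; values may only contain lowercase letters, digits, underscores, and hyphens, up to 63 characters. A small sketch with made-up labels (requires the `nebius` extra, since the module imports the Nebius SDK):

    from dstack._internal.core.backends.nebius import resources

    labels = {
        "team": "ml-research",   # passes both patterns
        "Cost-Center": "R&D",    # uppercase and "&" are rejected
    }

    # filter_invalid_labels() drops bad entries (logging a warning) before
    # disks/instances are created, so provisioning never fails on bad tags.
    assert resources.filter_invalid_labels(labels) == {"team": "ml-research"}

    # validate_labels() raises BackendError; the configurator converts it to
    # ServerClientError when checking backend-level `tags`.
    resources.validate_labels(labels)  # raises BackendError("Invalid resource labels")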
dstack/_internal/core/compatibility/runs.py CHANGED

@@ -53,6 +53,10 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
     }
     if all(js.exit_status is None for js in job_submissions):
         job_submissions_excludes["exit_status"] = True
+    if all(js.status_message == "" for js in job_submissions):
+        job_submissions_excludes["status_message"] = True
+    if all(js.error is None for js in job_submissions):
+        job_submissions_excludes["error"] = True
     if all(js.deployment_num == 0 for js in job_submissions):
         job_submissions_excludes["deployment_num"] = True
     if all(not js.probes for js in job_submissions):
@@ -71,6 +75,10 @@ def get_apply_plan_excludes(plan: ApplyRunPlanInput) -> Optional[IncludeExcludeD
     }
     if latest_job_submission.exit_status is None:
         latest_job_submission_excludes["exit_status"] = True
+    if latest_job_submission.status_message == "":
+        latest_job_submission_excludes["status_message"] = True
+    if latest_job_submission.error is None:
+        latest_job_submission_excludes["error"] = True
     if latest_job_submission.deployment_num == 0:
         latest_job_submission_excludes["deployment_num"] = True
     if not latest_job_submission.probes:

dstack/_internal/core/models/profiles.py CHANGED

@@ -80,14 +80,21 @@ def parse_stop_duration(
 def parse_off_duration(v: Optional[Union[int, str, bool]]) -> Optional[Union[Literal["off"], int]]:
     if v == "off" or v is False:
         return "off"
-    if v is True:
+    if v is True or v is None:
         return None
-    return parse_duration(v)
+    duration = parse_duration(v)
+    if duration < 0:
+        raise ValueError("Duration cannot be negative")
+    return duration


-def parse_idle_duration(v: Optional[Union[int, str]]) -> Optional[int]:
-    if v == "off" or v == -1:
+def parse_idle_duration(v: Optional[Union[int, str, bool]]) -> Optional[int]:
+    # Differs from `parse_off_duration` to accept negative durations as `off`
+    # for backward compatibility.
+    if v == "off" or v is False or v == -1:
         return -1
+    if v is True:
+        return None
     return parse_duration(v)

dstack/_internal/server/background/tasks/process_fleets.py CHANGED

@@ -1,10 +1,11 @@
+from collections import defaultdict
 from datetime import timedelta
 from typing import List
 from uuid import UUID

 from sqlalchemy import select, update
 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import joinedload, load_only
+from sqlalchemy.orm import joinedload, load_only, selectinload

 from dstack._internal.core.models.fleets import FleetSpec, FleetStatus
 from dstack._internal.core.models.instances import InstanceStatus
@@ -37,30 +38,68 @@ MIN_PROCESSING_INTERVAL = timedelta(seconds=30)

 @sentry_utils.instrument_background_task
 async def process_fleets():
-    lock, lockset = get_locker(get_db().dialect_name).get_lockset(FleetModel.__tablename__)
+    fleet_lock, fleet_lockset = get_locker(get_db().dialect_name).get_lockset(
+        FleetModel.__tablename__
+    )
+    instance_lock, instance_lockset = get_locker(get_db().dialect_name).get_lockset(
+        InstanceModel.__tablename__
+    )
     async with get_session_ctx() as session:
-        async with lock:
+        async with fleet_lock, instance_lock:
            res = await session.execute(
                select(FleetModel)
                .where(
                    FleetModel.deleted == False,
-                    FleetModel.id.not_in(lockset),
+                    FleetModel.id.not_in(fleet_lockset),
                    FleetModel.last_processed_at
                    < get_current_datetime() - MIN_PROCESSING_INTERVAL,
                )
-                .options(load_only(FleetModel.id))
+                .options(
+                    load_only(FleetModel.id, FleetModel.name),
+                    selectinload(FleetModel.instances).load_only(InstanceModel.id),
+                )
                .order_by(FleetModel.last_processed_at.asc())
                .limit(BATCH_SIZE)
                .with_for_update(skip_locked=True, key_share=True)
            )
-            fleet_models = list(res.scalars().all())
+            fleet_models = list(res.scalars().unique().all())
            fleet_ids = [fm.id for fm in fleet_models]
+            res = await session.execute(
+                select(InstanceModel)
+                .where(
+                    InstanceModel.id.not_in(instance_lockset),
+                    InstanceModel.fleet_id.in_(fleet_ids),
+                )
+                .options(load_only(InstanceModel.id, InstanceModel.fleet_id))
+                .order_by(InstanceModel.id)
+                .with_for_update(skip_locked=True, key_share=True)
+            )
+            instance_models = list(res.scalars().all())
+            fleet_id_to_locked_instances = defaultdict(list)
+            for instance_model in instance_models:
+                fleet_id_to_locked_instances[instance_model.fleet_id].append(instance_model)
+            # Process only fleets with all instances locked.
+            # Other fleets won't be processed but will still be locked to avoid new transaction.
+            # This should not be problematic as long as process_fleets is quick.
+            fleet_models_to_process = []
+            for fleet_model in fleet_models:
+                if len(fleet_model.instances) == len(fleet_id_to_locked_instances[fleet_model.id]):
+                    fleet_models_to_process.append(fleet_model)
+                else:
+                    logger.debug(
+                        "Fleet %s processing will be skipped: some instance were not locked",
+                        fleet_model.name,
+                    )
            for fleet_id in fleet_ids:
-                lockset.add(fleet_id)
+                fleet_lockset.add(fleet_id)
+            instance_ids = [im.id for im in instance_models]
+            for instance_id in instance_ids:
+                instance_lockset.add(instance_id)
            try:
-                await _process_fleets(session=session, fleet_models=fleet_models)
+                await _process_fleets(session=session, fleet_models=fleet_models_to_process)
            finally:
-                lockset.difference_update(fleet_ids)
+                fleet_lockset.difference_update(fleet_ids)
+                instance_lockset.difference_update(instance_ids)


 async def _process_fleets(session: AsyncSession, fleet_models: List[FleetModel]):
@@ -99,8 +138,8 @@ def _consolidate_fleet_state_with_spec(session: AsyncSession, fleet_model: Fleet
         return
     if not _is_fleet_ready_for_consolidation(fleet_model):
         return
-    added_instances = _maintain_fleet_nodes_min(session, fleet_model, fleet_spec)
-    if added_instances:
+    changed_instances = _maintain_fleet_nodes_in_min_max_range(session, fleet_model, fleet_spec)
+    if changed_instances:
         fleet_model.consolidation_attempt += 1
     else:
         # The fleet is already consolidated or consolidation is in progress.
@@ -138,28 +177,47 @@ def _get_consolidation_retry_delay(consolidation_attempt: int) -> timedelta:
     return _CONSOLIDATION_RETRY_DELAYS[-1]


-def _maintain_fleet_nodes_min(
+def _maintain_fleet_nodes_in_min_max_range(
     session: AsyncSession,
     fleet_model: FleetModel,
     fleet_spec: FleetSpec,
 ) -> bool:
     """
-    Ensures the fleet has at least `nodes.min` instances.
-    Returns `True` if retried or added new instances and `False` otherwise.
+    Ensures the fleet has at least `nodes.min` and at most `nodes.max` instances.
+    Returns `True` if retried, added new instances, or terminated redundant instances and `False` otherwise.
     """
     assert fleet_spec.configuration.nodes is not None
     for instance in fleet_model.instances:
         # Delete terminated but not deleted instances since
         # they are going to be replaced with new pending instances.
         if instance.status == InstanceStatus.TERMINATED and not instance.deleted:
-            # It's safe to modify instances without instance lock since
-            # no other task modifies already terminated instances.
             instance.deleted = True
             instance.deleted_at = get_current_datetime()
     active_instances = [i for i in fleet_model.instances if not i.deleted]
     active_instances_num = len(active_instances)
     if active_instances_num >= fleet_spec.configuration.nodes.min:
-        return False
+        if (
+            fleet_spec.configuration.nodes.max is None
+            or active_instances_num <= fleet_spec.configuration.nodes.max
+        ):
+            return False
+        # Fleet has more instances than allowed by nodes.max.
+        # This is possible due to race conditions (e.g. provisioning jobs in a fleet concurrently)
+        # or if nodes.max is updated.
+        nodes_redundant = active_instances_num - fleet_spec.configuration.nodes.max
+        for instance in fleet_model.instances:
+            if nodes_redundant == 0:
+                break
+            if instance.status in [InstanceStatus.IDLE]:
+                instance.status = InstanceStatus.TERMINATING
+                instance.termination_reason = "Fleet has too many instances"
+                nodes_redundant -= 1
+                logger.info(
+                    "Terminating instance %s: %s",
+                    instance.name,
+                    instance.termination_reason,
+                )
+        return True
     nodes_missing = fleet_spec.configuration.nodes.min - active_instances_num
     for i in range(nodes_missing):
         instance_model = create_fleet_instance_model(
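The consolidation rule above can be summarized in isolation. This standalone sketch mirrors the decision (it is not the dstack function itself, and the numbers are illustrative):

    from typing import Optional, Tuple

    def plan_consolidation(active: int, nodes_min: int, nodes_max: Optional[int]) -> Tuple[str, int]:
        """What the min/max range check decides for a fleet with `active` non-deleted instances."""
        if active < nodes_min:
            return ("add", nodes_min - active)             # create missing pending instances
        if nodes_max is not None and active > nodes_max:
            return ("terminate_idle", active - nodes_max)  # mark redundant IDLE instances TERMINATING
        return ("noop", 0)                                 # already consolidated

    assert plan_consolidation(active=1, nodes_min=2, nodes_max=4) == ("add", 1)
    assert plan_consolidation(active=6, nodes_min=2, nodes_max=4) == ("terminate_idle", 2)
    assert plan_consolidation(active=3, nodes_min=2, nodes_max=None) == ("noop", 0)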
dstack/_internal/server/background/tasks/process_instances.py CHANGED

@@ -259,9 +259,7 @@ async def _add_remote(instance: InstanceModel) -> None:
     if instance.status == InstanceStatus.PENDING:
         instance.status = InstanceStatus.PROVISIONING

-    retry_duration_deadline = instance.created_at.replace(
-        tzinfo=datetime.timezone.utc
-    ) + timedelta(seconds=PROVISIONING_TIMEOUT_SECONDS)
+    retry_duration_deadline = instance.created_at + timedelta(seconds=PROVISIONING_TIMEOUT_SECONDS)
     if retry_duration_deadline < get_current_datetime():
         instance.status = InstanceStatus.TERMINATED
         instance.termination_reason = "Provisioning timeout expired"

dstack/_internal/server/background/tasks/process_runs.py CHANGED

@@ -256,8 +256,8 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
     for replica_num, job_models in group_jobs_by_replica_latest(run_model.jobs):
         replica_statuses: Set[RunStatus] = set()
         replica_needs_retry = False
-
         replica_active = True
+        jobs_done_num = 0
         for job_model in job_models:
             job = find_job(run.jobs, job_model.replica_num, job_model.job_num)
             if (
@@ -272,8 +272,7 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
             ):
                 # the job is done or going to be done
                 replica_statuses.add(RunStatus.DONE)
-                # for some reason the replica is done, it's not active
-                replica_active = False
+                jobs_done_num += 1
             elif job_model.termination_reason == JobTerminationReason.SCALED_DOWN:
                 # the job was scaled down
                 replica_active = False
@@ -313,26 +312,14 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
         if not replica_needs_retry or retry_single_job:
             run_statuses.update(replica_statuses)

-        if replica_active:
-            # submitted_at = replica created
-            replicas_info.append(
-                autoscalers.ReplicaInfo(
-                    active=True,
-                    timestamp=min(job.submitted_at for job in job_models).replace(
-                        tzinfo=datetime.timezone.utc
-                    ),
-                )
-            )
-        else:
-            # last_processed_at = replica scaled down
-            replicas_info.append(
-                autoscalers.ReplicaInfo(
-                    active=False,
-                    timestamp=max(job.last_processed_at for job in job_models).replace(
-                        tzinfo=datetime.timezone.utc
-                    ),
-                )
-            )
+        if jobs_done_num == len(job_models):
+            # Consider replica inactive if all its jobs are done for some reason.
+            # If only some jobs are done, replica is considered active to avoid
+            # provisioning new replicas for partially done multi-node tasks.
+            replica_active = False
+
+        replica_info = _get_replica_info(job_models, replica_active)
+        replicas_info.append(replica_info)

     termination_reason: Optional[RunTerminationReason] = None
     if RunStatus.FAILED in run_statuses:
@@ -410,6 +397,23 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
         run_model.resubmission_attempt += 1


+def _get_replica_info(
+    replica_job_models: list[JobModel],
+    replica_active: bool,
+) -> autoscalers.ReplicaInfo:
+    if replica_active:
+        # submitted_at = replica created
+        return autoscalers.ReplicaInfo(
+            active=True,
+            timestamp=min(job.submitted_at for job in replica_job_models),
+        )
+    # last_processed_at = replica scaled down
+    return autoscalers.ReplicaInfo(
+        active=False,
+        timestamp=max(job.last_processed_at for job in replica_job_models),
+    )
+
+
 async def _handle_run_replicas(
     session: AsyncSession,
     run_model: RunModel,

dstack/_internal/server/background/tasks/process_submitted_jobs.py CHANGED

@@ -260,7 +260,6 @@ async def _process_submitted_job(session: AsyncSession, job_model: JobModel):

     instance_filters = [
         InstanceModel.deleted == False,
-        InstanceModel.total_blocks > InstanceModel.busy_blocks,
         InstanceModel.id.not_in(detaching_instances_ids),
     ]

@@ -514,9 +513,6 @@ async def _find_optimal_fleet_with_offers(
         )
         return run_model.fleet, fleet_instances_with_pool_offers

-    if len(fleet_models) == 0:
-        return None, []
-
     nodes_required_num = _get_nodes_required_num_for_run(run_spec)
     # The current strategy is first to consider fleets that can accommodate
     # the run without additional provisioning and choose the one with the cheapest pool offer.
@@ -534,6 +530,7 @@ async def _find_optimal_fleet_with_offers(
         ]
     ] = []
     for candidate_fleet_model in fleet_models:
+        candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
         fleet_instances_with_pool_offers = _get_fleet_instances_with_pool_offers(
             fleet_model=candidate_fleet_model,
             run_spec=run_spec,
@@ -541,24 +538,21 @@ async def _find_optimal_fleet_with_offers(
             master_job_provisioning_data=master_job_provisioning_data,
             volumes=volumes,
         )
-        fleet_has_available_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
+        fleet_has_pool_capacity = nodes_required_num <= len(fleet_instances_with_pool_offers)
         fleet_cheapest_pool_offer = math.inf
         if len(fleet_instances_with_pool_offers) > 0:
            fleet_cheapest_pool_offer = fleet_instances_with_pool_offers[0][1].price

-        candidate_fleet = fleet_model_to_fleet(candidate_fleet_model)
-        profile = None
-        requirements = None
         try:
+            _check_can_create_new_instance_in_fleet(candidate_fleet)
            profile, requirements = _get_run_profile_and_requirements_in_fleet(
                job=job,
                run_spec=run_spec,
                fleet=candidate_fleet,
            )
         except ValueError:
-            pass
-        fleet_backend_offers = []
-        if profile is not None and requirements is not None:
+            fleet_backend_offers = []
+        else:
            multinode = (
                candidate_fleet.spec.configuration.placement == InstanceGroupPlacement.CLUSTER
                or job.job_spec.jobs_per_replica > 1
@@ -579,8 +573,12 @@ async def _find_optimal_fleet_with_offers(
         if len(fleet_backend_offers) > 0:
            fleet_cheapest_backend_offer = fleet_backend_offers[0][1].price

+        if not _run_can_fit_into_fleet(run_spec, candidate_fleet):
+            logger.debug("Skipping fleet %s from consideration: run cannot fit into fleet")
+            continue
+
         fleet_priority = (
-            not fleet_has_available_capacity,
+            not fleet_has_pool_capacity,
            fleet_cheapest_pool_offer,
            fleet_cheapest_backend_offer,
         )
@@ -593,10 +591,13 @@ async def _find_optimal_fleet_with_offers(
                fleet_priority,
            )
         )
+    if len(candidate_fleets_with_offers) == 0:
+        return None, []
     if run_spec.merged_profile.fleets is None and all(
         t[2] == 0 and t[3] == 0 for t in candidate_fleets_with_offers
     ):
-        # If fleets are not specified and no fleets have available pool or backend offers, create a new fleet.
+        # If fleets are not specified and no fleets have available pool
+        # or backend offers, create a new fleet.
         # This is for compatibility with non-fleet-first UX when runs created new fleets
         # if there are no instances to reuse.
         return None, []
@@ -616,6 +617,39 @@ def _get_nodes_required_num_for_run(run_spec: RunSpec) -> int:
     return nodes_required_num


+def _run_can_fit_into_fleet(run_spec: RunSpec, fleet: Fleet) -> bool:
+    """
+    Returns `False` if the run cannot fit into fleet for sure.
+    This is helpful heuristic to avoid even considering fleets too small for a run.
+    A run may not fit even if this function returns `True`.
+    This will lead to some jobs failing due to exceeding `nodes.max`
+    or more than `nodes.max` instances being provisioned
+    and eventually removed by the fleet consolidation logic.
+    """
+    # No check for cloud fleets with blocks > 1 since we don't know
+    # how many jobs such fleets can accommodate.
+    nodes_required_num = _get_nodes_required_num_for_run(run_spec)
+    if (
+        fleet.spec.configuration.nodes is not None
+        and fleet.spec.configuration.blocks == 1
+        and fleet.spec.configuration.nodes.max is not None
+    ):
+        busy_instances = [i for i in fleet.instances if i.busy_blocks > 0]
+        fleet_available_capacity = fleet.spec.configuration.nodes.max - len(busy_instances)
+        if fleet_available_capacity < nodes_required_num:
+            return False
+    elif fleet.spec.configuration.ssh_config is not None:
+        # Currently assume that each idle block can run a job.
+        # TODO: Take resources / eligible offers into account.
+        total_idle_blocks = 0
+        for instance in fleet.instances:
+            total_blocks = instance.total_blocks or 1
+            total_idle_blocks += total_blocks - instance.busy_blocks
+        if total_idle_blocks < nodes_required_num:
+            return False
+    return True
+
+
 def _get_fleet_instances_with_pool_offers(
     fleet_model: FleetModel,
     run_spec: RunSpec,
@@ -713,6 +747,7 @@ async def _run_job_on_new_instance(
     if fleet_model is not None:
         fleet = fleet_model_to_fleet(fleet_model)
         try:
+            _check_can_create_new_instance_in_fleet(fleet)
            profile, requirements = _get_run_profile_and_requirements_in_fleet(
                job=job,
                run_spec=run.run_spec,
@@ -787,8 +822,6 @@ def _get_run_profile_and_requirements_in_fleet(
     run_spec: RunSpec,
     fleet: Fleet,
 ) -> tuple[Profile, Requirements]:
-    if not _check_can_create_new_instance_in_fleet(fleet):
-        raise ValueError("Cannot fit new instance into fleet")
     profile = combine_fleet_and_run_profiles(fleet.spec.merged_profile, run_spec.merged_profile)
     if profile is None:
         raise ValueError("Cannot combine fleet profile")
@@ -801,13 +834,23 @@ def _get_run_profile_and_requirements_in_fleet(
     return profile, requirements


-def _check_can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
+def _check_can_create_new_instance_in_fleet(fleet: Fleet):
+    if not _can_create_new_instance_in_fleet(fleet):
+        raise ValueError("Cannot fit new instance into fleet")
+
+
+def _can_create_new_instance_in_fleet(fleet: Fleet) -> bool:
     if fleet.spec.configuration.ssh_config is not None:
         return False
-    # TODO: Respect nodes.max
-    # Ensure concurrent provisioning does not violate nodes.max
-    # E.g. lock fleet and split instance model creation
-    # and instance provisioning into separate transactions.
+    active_instances = [i for i in fleet.instances if i.status.is_active()]
+    # nodes.max is a soft limit that can be exceeded when provisioning concurrently.
+    # The fleet consolidation logic will remove redundant nodes eventually.
+    if (
+        fleet.spec.configuration.nodes is not None
+        and fleet.spec.configuration.nodes.max is not None
+        and len(active_instances) >= fleet.spec.configuration.nodes.max
+    ):
+        return False
     return True

dstack/version.py CHANGED
@@ -1,4 +1,4 @@
-__version__ = "0.19.31"
+__version__ = "0.19.32"
 __is_release__ = True
 base_image = "0.11rc2"
 base_image_ubuntu_version = "22.04"

dstack-0.19.31.dist-info/METADATA → dstack-0.19.32.dist-info/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dstack
-Version: 0.19.31
+Version: 0.19.32
 Summary: dstack is an open-source orchestration engine for running AI workloads on any cloud or on-premises.
 Project-URL: Homepage, https://dstack.ai
 Project-URL: Source, https://github.com/dstackai/dstack
@@ -73,7 +73,7 @@ Requires-Dist: grpcio>=1.50; extra == 'all'
 Requires-Dist: httpx; extra == 'all'
 Requires-Dist: jinja2; extra == 'all'
 Requires-Dist: kubernetes; extra == 'all'
-Requires-Dist: nebius<0.3,>=0.2.40; (python_version >= '3.10') and extra == 'all'
+Requires-Dist: nebius<=0.2.72,>=0.2.40; (python_version >= '3.10') and extra == 'all'
 Requires-Dist: oci>=2.150.0; extra == 'all'
 Requires-Dist: prometheus-client; extra == 'all'
 Requires-Dist: pyopenssl>=23.2.0; extra == 'all'
@@ -259,7 +259,7 @@ Requires-Dist: fastapi; extra == 'nebius'
 Requires-Dist: grpcio>=1.50; extra == 'nebius'
 Requires-Dist: httpx; extra == 'nebius'
 Requires-Dist: jinja2; extra == 'nebius'
-Requires-Dist: nebius<0.3,>=0.2.40; (python_version >= '3.10') and extra == 'nebius'
+Requires-Dist: nebius<=0.2.72,>=0.2.40; (python_version >= '3.10') and extra == 'nebius'
 Requires-Dist: prometheus-client; extra == 'nebius'
 Requires-Dist: python-dxf==12.1.0; extra == 'nebius'
 Requires-Dist: python-json-logger>=3.1.0; extra == 'nebius'
@@ -340,15 +340,13 @@ It streamlines development, training, and inference, and is compatible with any
 `dstack` supports `NVIDIA`, `AMD`, `Google TPU`, `Intel Gaudi`, and `Tenstorrent` accelerators out of the box.

 ## Latest news ✨
-- [2025/09] [dstack 0.19.27: Offers UI, Digital Ocean and AMD Developer Cloud](https://github.com/dstackai/dstack/releases/tag/0.19.27)
-- [2025/08] [dstack 0.19.26: Repos – explicit repo configuration via YAML](https://github.com/dstackai/dstack/releases/tag/0.19.26)
-- [2025/08] [dstack 0.19.25: `dstack offer` CLI command](https://github.com/dstackai/dstack/releases/tag/0.19.25)
-- [2025/08] [dstack 0.19.22: Service probes, GPU health-checks, Tenstorrent Galaxy, Secrets UI](https://github.com/dstackai/dstack/releases/tag/0.19.22)
+- [2025/10] [dstack 0.19.31: Kubernetes, GCP A4 spot](https://github.com/dstackai/dstack/releases/tag/0.19.31)
+- [2025/08] [dstack 0.19.26: Repos](https://github.com/dstackai/dstack/releases/tag/0.19.26)
+- [2025/08] [dstack 0.19.22: Service probes, GPU health-checks, Tenstorrent Galaxy](https://github.com/dstackai/dstack/releases/tag/0.19.22)
 - [2025/07] [dstack 0.19.21: Scheduled tasks](https://github.com/dstackai/dstack/releases/tag/0.19.21)
 - [2025/07] [dstack 0.19.17: Secrets, Files, Rolling deployment](https://github.com/dstackai/dstack/releases/tag/0.19.17)
-- [2025/06] [dstack 0.19.16: Docker in Docker, CloudRift](https://github.com/dstackai/dstack/releases/tag/0.19.16)
-- [2025/06] [dstack 0.19.13: InfiniBand support in default images](https://github.com/dstackai/dstack/releases/tag/0.19.13)
-- [2025/06] [dstack 0.19.12: Simplified use of MPI](https://github.com/dstackai/dstack/releases/tag/0.19.12)
+- [2025/06] [dstack 0.19.16: Docker in Docker](https://github.com/dstackai/dstack/releases/tag/0.19.16)
+- [2025/06] [dstack 0.19.13: Default images with InfiniBand support](https://github.com/dstackai/dstack/releases/tag/0.19.13)

 ## How does it work?

@@ -364,11 +362,11 @@ It streamlines development, training, and inference, and is compatible with any

 To orchestrate compute across cloud providers or existing Kubernetes clusters, you need to configure backends.

-Backends can be set up in `~/.dstack/server/config.yml` or through the [project settings page](../concepts/projects.md#backends) in the UI.
+Backends can be set up in `~/.dstack/server/config.yml` or through the [project settings page](https://dstack.ai/docs/concepts/projects#backends) in the UI.

-For more details, see [Backends](../concepts/backends.md).
+For more details, see [Backends](https://dstack.ai/docs/concepts/backends).

-> When using `dstack` with on-prem servers, backend configuration isn’t required. Simply create [SSH fleets](../concepts/fleets.md#ssh) once the server is up.
+> When using `dstack` with on-prem servers, backend configuration isn’t required. Simply create [SSH fleets](https://dstack.ai/docs/concepts/fleets#ssh) once the server is up.

 ##### Start the server

dstack-0.19.31.dist-info/RECORD → dstack-0.19.32.dist-info/RECORD CHANGED

@@ -1,5 +1,5 @@
 dstack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-dstack/version.py,sha256=bfw1WiD5UTLEsyy2XkGQGNyKevTEg-OnV98FYK9gm7Q,105
+dstack/version.py,sha256=DLiOZq8Gabr_DjHGIzjxI9IasDON-4xNaF3b4Rt2BBI,105
 dstack/_internal/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/compat.py,sha256=bF9U9fTMfL8UVhCouedoUSTYFl7UAOiU0WXrnRoByxw,40
 dstack/_internal/settings.py,sha256=FYtd7tRk17Oc62Kl_3O8NuT5JHb8TKhLThl1TsfjjVs,1390
@@ -118,12 +118,12 @@ dstack/_internal/core/backends/gcp/features/tcpx.py,sha256=8bDR5kwF5qke5EWNdBscd
 dstack/_internal/core/backends/hotaisle/__init__.py,sha256=CYMaS1jd9Km0Y6Jvg4ePjYOtfqL9swGsRo5kcXGFrFQ,30
 dstack/_internal/core/backends/hotaisle/api_client.py,sha256=Fd1TOg4_orwQyJtoZ657zJweLeBhzj_9ObfL538S5uI,3640
 dstack/_internal/core/backends/hotaisle/backend.py,sha256=o0cqLIKGcrXhvksHHGvjCpLShoQxT2IKdJy9sm0H9gE,586
-dstack/_internal/core/backends/hotaisle/compute.py,sha256=y72Mmzhq2xVGc5tKK3k7_ovog8_vXVngfLhcIvH-p2I,7551
+dstack/_internal/core/backends/hotaisle/compute.py,sha256=X9XbIatbFH5wqLoSH3Z9nNOhBMrXnVVayFn6xi4zu-g,8224
 dstack/_internal/core/backends/hotaisle/configurator.py,sha256=EJwdKFfC0ab0pe4lzeV65b80Ok21rR0OfupOmuqCp6c,2287
 dstack/_internal/core/backends/hotaisle/models.py,sha256=CmJ20SbpKzFldX7rrR0CpVytSJSN2YWKQ3Ixnta_A1M,1334
 dstack/_internal/core/backends/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/core/backends/kubernetes/backend.py,sha256=Jy0_Nwn6Oro8McJIo_QeNxxq4Pmwsd7JPd5_YE8Fz9U,606
-dstack/_internal/core/backends/kubernetes/compute.py,sha256=AiZk5uWtON-QXyi4iVC1InmkNmtUhE6mJXWfCKF8KG0,32428
+dstack/_internal/core/backends/kubernetes/compute.py,sha256=7xVdo2HK-dTZjkQQtuIfiTXLhBSzhUO0BomZBaPG5UM,34989
 dstack/_internal/core/backends/kubernetes/configurator.py,sha256=RK8_eznv1AFrcG3fM-KIxyolsaJ8UTBAO7c3P3RCBnw,2228
 dstack/_internal/core/backends/kubernetes/models.py,sha256=vGOhRYP4OzhF62BN5bfRGd4E2tKPaqdlZY8tMmjZoJ0,2308
 dstack/_internal/core/backends/kubernetes/utils.py,sha256=1DkkL_VWShFFqN-Crh0ddebRyXXyL435FyrjVkFLR1Q,6286
@@ -138,11 +138,11 @@ dstack/_internal/core/backends/local/backend.py,sha256=KJuNXUXrg60NhLywnExD1EXH2
 dstack/_internal/core/backends/local/compute.py,sha256=tWNsKGKYlPx9yeqwlpAL_XExOYMPLcb6AsGAji3YO3M,3825
 dstack/_internal/core/backends/nebius/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/core/backends/nebius/backend.py,sha256=2XqZIbSR8VzlfOnuVklXlDxNmwAkQj7txQN8VXF1j2E,566
-dstack/_internal/core/backends/nebius/compute.py,sha256=bBfNai_GkrHzWHnRRnBkUObQxV4aD_Fog9eQiaPL0Kw,14920
-dstack/_internal/core/backends/nebius/configurator.py,sha256=PilZ5M0xj-koYz9PPew9L29rrHoF2JrW2bxgCqt27u4,3213
+dstack/_internal/core/backends/nebius/compute.py,sha256=US5W8Q0UT09huQybDTBLVE2EDpRG3UdMqq1DZFFaCI4,15454
+dstack/_internal/core/backends/nebius/configurator.py,sha256=eybolJi5rlEeU8GBXC7pdOU7To32ASQGHDAiE2cNeFo,3794
 dstack/_internal/core/backends/nebius/fabrics.py,sha256=-X-nSPV2pUin2PAYDHGm-j14KPboIFRpLi93PKHUXTM,1616
-dstack/_internal/core/backends/nebius/models.py,sha256=UudYX32p-ZY-GWR83VEtY5dpZBaWhKXQIfn2nrBCq-4,4245
-dstack/_internal/core/backends/nebius/resources.py,sha256=ttgwdqokvXF8BH_IDPFZxWqr1uAMpdO3_Q31VleiXvk,12731
+dstack/_internal/core/backends/nebius/models.py,sha256=OSiUANBf893Xdm7-4WDoPfmd3YFk5-oRjdXiWUjvDdk,6194
+dstack/_internal/core/backends/nebius/resources.py,sha256=MVyMaS0-mZu2g-tJ4HF7GiT1hFFL7Mha9hXtP3XeT7o,14070
 dstack/_internal/core/backends/oci/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/core/backends/oci/auth.py,sha256=8Cr18y_LOsyRP-16yfFpT70Cofpm0clB3KawS_7aRl4,717
 dstack/_internal/core/backends/oci/backend.py,sha256=yXjVCt7n6BVLH0byYFbNFf-P9J0FwlNfxsYbKGMdoI4,536
@@ -182,7 +182,7 @@ dstack/_internal/core/compatibility/fleets.py,sha256=jg42A7OmprqATKKt6JpLL1qOQSZ
 dstack/_internal/core/compatibility/gateways.py,sha256=4h_lfpN9KJFyLTFexq-wlu74Rwpk0anV67v38aJ-SnI,1463
 dstack/_internal/core/compatibility/gpus.py,sha256=myWVUjaK2S1QuYgRZyMtD8-DPKQTjadSbsnECtfoHHs,575
 dstack/_internal/core/compatibility/logs.py,sha256=keXt3OFKR0CjD_XMsetzRu8yQGCz7CWBwycuP267L_Q,629
-dstack/_internal/core/compatibility/runs.py,sha256=kdYvirgXWO2m9Uj_LOadzMUsJSWXtFY6yEdNadNypZs,8869
+dstack/_internal/core/compatibility/runs.py,sha256=pT5RxheOzJJXFpJM1th-ku9-inj3McMBcdEHxcMBp9U,9357
 dstack/_internal/core/compatibility/volumes.py,sha256=ofjpVusuc-pq285bGrIh8PAqu0QlAd6NQgU3gfJQIc0,1546
 dstack/_internal/core/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/core/models/common.py,sha256=QKdZM7L2NepzOPavkkI_q3g6WYCauMTbtZSZwWZVYHE,4704
@@ -197,7 +197,7 @@ dstack/_internal/core/models/instances.py,sha256=Gpv46fu3uWO-3f8w1A6rBzU5dYhmO_w
 dstack/_internal/core/models/logs.py,sha256=VOsgEsvUIRNNHivD6OZnPZNC52ioqafv7ccdnFQ1YI8,529
 dstack/_internal/core/models/metrics.py,sha256=Xb8hCXUL-ncQ3PMsErIUAJTe9gwh5jyrQ4UQoZbibsc,269
 dstack/_internal/core/models/placement.py,sha256=WJVq5ENJykyRarQzL2EeYQag_9_jV7VSAtR_xoFvPVM,720
-dstack/_internal/core/models/profiles.py,sha256=_ZxSk-rMvzpiUYUZD0EQbrJDy3FVmRUmIUtEgzlVeHo,14217
+dstack/_internal/core/models/profiles.py,sha256=YW7XeztKBnVCptvUyR8-E-dbB06nu9GIqwLvmtHgSms,14501
 dstack/_internal/core/models/projects.py,sha256=hOZoL85q-873vT_Aw7FhzpS6DGVt0Y3yT8kpElrLFto,833
 dstack/_internal/core/models/resources.py,sha256=-dLupzud5BSqxNABBjLVYTCKekbr9_mhaNGD1ZWBjgM,14544
 dstack/_internal/core/models/runs.py,sha256=ehGSyCSx5OAaEqCEd2YvCRiP_uewUeDOHbJGSocu6w0,22609
@@ -289,17 +289,17 @@ dstack/_internal/server/settings.py,sha256=7SRzSlTnUPNNjlZH-vpgwbwj95gI-LLtQite8
 dstack/_internal/server/background/__init__.py,sha256=QftEjgQZffu83sY-F0WL65vRp28FbBEtezfowQYcTv4,5606
 dstack/_internal/server/background/tasks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dstack/_internal/server/background/tasks/common.py,sha256=n87hFjDNtS2x8mYbBnKLqhXus1P8qkdfqXG1TSeIJjM,1089
-dstack/_internal/server/background/tasks/process_fleets.py,sha256=b0wECmweaetFv9vSjE1BiOJmlIPlk7O-fmntpdSiWgg,8590
+dstack/_internal/server/background/tasks/process_fleets.py,sha256=0i_S3HCZp4AjQjER7j_pvIm22eYbrBwZnt9-0kgsq3E,11547
 dstack/_internal/server/background/tasks/process_gateways.py,sha256=FH9RY3Tfmtw_UctCdYZDIRb2rgtmHdxTg6Oc4IBiDBA,8356
 dstack/_internal/server/background/tasks/process_idle_volumes.py,sha256=mqnl8wvWaKTYvJMbgFJbOP-bZMRQG2vrhUnaNcyBldE,5223
-dstack/_internal/server/background/tasks/process_instances.py,sha256=siltiaMbm1zUi4r0CwUODj_8iRx5twJpVAp4cP05n6w,44059
+dstack/_internal/server/background/tasks/process_instances.py,sha256=IeKl28NSC-va5QABW0PC3eDGU6r-Xz1TNuSK2EtIX08,44007
 dstack/_internal/server/background/tasks/process_metrics.py,sha256=yKXe9J7m3oleK0C-oGJaYNkcPT8kqkz0nw-A7xqYbjE,6390
 dstack/_internal/server/background/tasks/process_placement_groups.py,sha256=lgYIzjHG9EITK31yG6uQjlIcSwW5jsP9ZOBBZqW_eNs,4263
 dstack/_internal/server/background/tasks/process_probes.py,sha256=dmug-_rmYiVLLF-imto-Ju1gPtENvHvCjHyilqgYuJw,6457
 dstack/_internal/server/background/tasks/process_prometheus_metrics.py,sha256=_UZm37FVV4rhdd0So7HtcKbIgrSdAr5Vx-Uen_xizec,5459
 dstack/_internal/server/background/tasks/process_running_jobs.py,sha256=IoQi7mm4upEZgujTkWYrXDKrC5rSZ5Q4_jAR4OpajaM,44973
-dstack/_internal/server/background/tasks/process_runs.py,sha256=Cx7Z1B7pZVlvCl-OsIaAiIMFG_aZDdn3nlZeha6k2x4,25041
-dstack/_internal/server/background/tasks/process_submitted_jobs.py,sha256=XxPapMdCsuA_H_X27SIwIZFd0Y5jzwvIABnhqa-kwyQ,41098
+dstack/_internal/server/background/tasks/process_runs.py,sha256=K4km4XT0JYUf6JYbpKbEAyumUDBT21lqcMFTQ7pIsoY,25200
+dstack/_internal/server/background/tasks/process_submitted_jobs.py,sha256=fDuLfnSJqWDL2oPrxXQMkaqKgplMdKT0SRD_AyB4n-0,43099
 dstack/_internal/server/background/tasks/process_terminating_jobs.py,sha256=S7ZSDVMX-N0XMaMgwFa1QG_RAi48BP432s9AqHw4PMM,4066
 dstack/_internal/server/background/tasks/process_volumes.py,sha256=_fMmkwLYsyX-kpW9pDrZVJvFTZEOPp0gpjyKBMW-zw0,5204
 dstack/_internal/server/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -662,8 +662,8 @@ dstack/plugins/builtin/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
 dstack/plugins/builtin/rest_plugin/__init__.py,sha256=lgTsq8Z6Km2F2UhPRChVB4vDM5ZpWtdk1iB1aa20ypA,440
 dstack/plugins/builtin/rest_plugin/_models.py,sha256=9hgVuU6OGSxidar88XhQnNo9izYWeQvVH45ciErv-Es,1910
 dstack/plugins/builtin/rest_plugin/_plugin.py,sha256=h3r3Yc3h22i93fifPTgTm9Oojd1sN1O4DP7ZTV-kWpM,5386
-dstack-0.19.31.dist-info/METADATA,sha256=espPx6ZPYMP95O6EOFv7PmhkMdmjxCTn-yptJZxPER4,21085
-dstack-0.19.31.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-dstack-0.19.31.dist-info/entry_points.txt,sha256=GnLrMS8hx3rWAySQjA7tPNhtixV6a-brRkmal1PKoHc,58
-dstack-0.19.31.dist-info/licenses/LICENSE.md,sha256=qDABaRGjSKVOib1U8viw2P_96sIK7Puo426784oD9f8,15976
-dstack-0.19.31.dist-info/RECORD,,
+dstack-0.19.32.dist-info/METADATA,sha256=R_c6-NfPoaFBeuJVdHTD6dENAKZZsBo_syd6wCdiJ6M,20834
+dstack-0.19.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+dstack-0.19.32.dist-info/entry_points.txt,sha256=GnLrMS8hx3rWAySQjA7tPNhtixV6a-brRkmal1PKoHc,58
+dstack-0.19.32.dist-info/licenses/LICENSE.md,sha256=qDABaRGjSKVOib1U8viw2P_96sIK7Puo426784oD9f8,15976
+dstack-0.19.32.dist-info/RECORD,,